#1:Reading the data set
diabetic_largedata<-read.csv("diabetic_data.csv",header = T, sep=",",na.strings = "?",stringsAsFactors=T)
head(diabetic_largedata)
## encounter_id patient_nbr race gender age weight
## 1 2278392 8222157 Caucasian Female [0-10) <NA>
## 2 149190 55629189 Caucasian Female [10-20) <NA>
## 3 64410 86047875 AfricanAmerican Female [20-30) <NA>
## 4 500364 82442376 Caucasian Male [30-40) <NA>
## 5 16680 42519267 Caucasian Male [40-50) <NA>
## 6 35754 82637451 Caucasian Male [50-60) <NA>
## admission_type_id discharge_disposition_id admission_source_id
## 1 6 25 1
## 2 1 1 7
## 3 1 1 7
## 4 1 1 7
## 5 1 1 7
## 6 2 1 2
## time_in_hospital payer_code medical_specialty num_lab_procedures
## 1 1 <NA> Pediatrics-Endocrinology 41
## 2 3 <NA> <NA> 59
## 3 2 <NA> <NA> 11
## 4 2 <NA> <NA> 44
## 5 1 <NA> <NA> 51
## 6 3 <NA> <NA> 31
## num_procedures num_medications number_outpatient number_emergency
## 1 0 1 0 0
## 2 0 18 0 0
## 3 5 13 2 0
## 4 1 16 0 0
## 5 0 8 0 0
## 6 6 16 0 0
## number_inpatient diag_1 diag_2 diag_3 number_diagnoses max_glu_serum
## 1 0 250.83 <NA> <NA> 1 None
## 2 0 276 250.01 255 9 None
## 3 1 648 250 V27 6 None
## 4 0 8 250.43 403 7 None
## 5 0 197 157 250 5 None
## 6 0 414 411 250 9 None
## A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1 None No No No No No
## 2 None No No No No No
## 3 None No No No No No
## 4 None No No No No No
## 5 None No No No No No
## 6 None No No No No No
## acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## 1 No No No No No No
## 2 No No No No No No
## 3 No Steady No No No No
## 4 No No No No No No
## 5 No Steady No No No No
## 6 No No No No No No
## acarbose miglitol troglitazone tolazamide examide citoglipton insulin
## 1 No No No No No No No
## 2 No No No No No No Up
## 3 No No No No No No No
## 4 No No No No No No Up
## 5 No No No No No No Steady
## 6 No No No No No No Steady
## glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 1 No No No
## 2 No No No
## 3 No No No
## 4 No No No
## 5 No No No
## 6 No No No
## metformin.rosiglitazone metformin.pioglitazone change diabetesMed readmitted
## 1 No No No No NO
## 2 No No Ch Yes >30
## 3 No No No Yes NO
## 4 No No Ch Yes NO
## 5 No No Ch Yes NO
## 6 No No No Yes >30
**EDA
plot(sapply(diabetic_largedata,function(x)sum(is.na(x))))
sapply(diabetic_largedata,function(x)sum(is.na(x)))
## encounter_id patient_nbr race
## 0 0 2273
## gender age weight
## 0 0 98569
## admission_type_id discharge_disposition_id admission_source_id
## 0 0 0
## time_in_hospital payer_code medical_specialty
## 0 40256 49949
## num_lab_procedures num_procedures num_medications
## 0 0 0
## number_outpatient number_emergency number_inpatient
## 0 0 0
## diag_1 diag_2 diag_3
## 21 358 1423
## number_diagnoses max_glu_serum A1Cresult
## 0 0 0
## metformin repaglinide nateglinide
## 0 0 0
## chlorpropamide glimepiride acetohexamide
## 0 0 0
## glipizide glyburide tolbutamide
## 0 0 0
## pioglitazone rosiglitazone acarbose
## 0 0 0
## miglitol troglitazone tolazamide
## 0 0 0
## examide citoglipton insulin
## 0 0 0
## glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 0 0 0
## metformin.rosiglitazone metformin.pioglitazone change
## 0 0 0
## diabetesMed readmitted
## 0 0
str(diabetic_largedata)
## 'data.frame': 101766 obs. of 50 variables:
## $ encounter_id : int 2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
## $ patient_nbr : int 8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 3 1 3 3 3 3 3 3 3 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
## $ age : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ weight : Factor w/ 9 levels "[0-25)","[100-125)",..: NA NA NA NA NA NA NA NA NA NA ...
## $ admission_type_id : int 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: int 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : int 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : int 1 3 2 2 1 3 4 5 13 12 ...
## $ payer_code : Factor w/ 17 levels "BC","CH","CM",..: NA NA NA NA NA NA NA NA NA NA ...
## $ medical_specialty : Factor w/ 72 levels "AllergyandImmunology",..: 38 NA NA NA NA NA NA NA NA 19 ...
## $ num_lab_procedures : int 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : int 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : int 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : int 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 716 levels "10","11","110",..: 125 144 455 555 55 264 264 277 253 283 ...
## $ diag_2 : Factor w/ 748 levels "11","110","111",..: NA 80 79 98 25 247 247 315 261 47 ...
## $ diag_3 : Factor w/ 789 levels "11","110","111",..: NA 122 767 249 87 87 771 87 230 318 ...
## $ number_diagnoses : int 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ A1Cresult : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
## $ repaglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ nateglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ chlorpropamide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glimepiride : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
## $ acetohexamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glipizide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
## $ glyburide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
## $ tolbutamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ pioglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ rosiglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
## $ acarbose : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ miglitol : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ troglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ tolazamide : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ examide : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ citoglipton : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
## $ glyburide.metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glipizide.metformin : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.pioglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ readmitted : Factor w/ 3 levels "<30",">30","NO": 3 2 3 3 3 2 3 2 3 3 ...
#2:Checking the Missing values
library(DataExplorer)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
plot_missing(diabetic_largedata)
md.pattern(diabetic_largedata,plot = TRUE, rotate.names = TRUE)
## encounter_id patient_nbr gender age admission_type_id
## 1043 1 1 1 1 1
## 25712 1 1 1 1 1
## 1177 1 1 1 1 1
## 31197 1 1 1 1 1
## 513 1 1 1 1 1
## 22467 1 1 1 1 1
## 303 1 1 1 1 1
## 15641 1 1 1 1 1
## 42 1 1 1 1 1
## 681 1 1 1 1 1
## 42 1 1 1 1 1
## 895 1 1 1 1 1
## 33 1 1 1 1 1
## 271 1 1 1 1 1
## 18 1 1 1 1 1
## 209 1 1 1 1 1
## 5 1 1 1 1 1
## 268 1 1 1 1 1
## 2 1 1 1 1 1
## 192 1 1 1 1 1
## 1 1 1 1 1 1
## 467 1 1 1 1 1
## 4 1 1 1 1 1
## 147 1 1 1 1 1
## 34 1 1 1 1 1
## 14 1 1 1 1 1
## 1 1 1 1 1 1
## 7 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 39 1 1 1 1 1
## 29 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 6 1 1 1 1 1
## 3 1 1 1 1 1
## 65 1 1 1 1 1
## 39 1 1 1 1 1
## 3 1 1 1 1 1
## 127 1 1 1 1 1
## 3 1 1 1 1 1
## 22 1 1 1 1 1
## 1 1 1 1 1 1
## 11 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 5 1 1 1 1 1
## 5 1 1 1 1 1
## 5 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## discharge_disposition_id admission_source_id time_in_hospital
## 1043 1 1 1
## 25712 1 1 1
## 1177 1 1 1
## 31197 1 1 1
## 513 1 1 1
## 22467 1 1 1
## 303 1 1 1
## 15641 1 1 1
## 42 1 1 1
## 681 1 1 1
## 42 1 1 1
## 895 1 1 1
## 33 1 1 1
## 271 1 1 1
## 18 1 1 1
## 209 1 1 1
## 5 1 1 1
## 268 1 1 1
## 2 1 1 1
## 192 1 1 1
## 1 1 1 1
## 467 1 1 1
## 4 1 1 1
## 147 1 1 1
## 34 1 1 1
## 14 1 1 1
## 1 1 1 1
## 7 1 1 1
## 1 1 1 1
## 1 1 1 1
## 39 1 1 1
## 29 1 1 1
## 1 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 6 1 1 1
## 3 1 1 1
## 65 1 1 1
## 39 1 1 1
## 3 1 1 1
## 127 1 1 1
## 3 1 1 1
## 22 1 1 1
## 1 1 1 1
## 11 1 1 1
## 3 1 1 1
## 1 1 1 1
## 5 1 1 1
## 5 1 1 1
## 5 1 1 1
## 3 1 1 1
## 2 1 1 1
## 1 1 1 1
## 0 0 0
## num_lab_procedures num_procedures num_medications number_outpatient
## 1043 1 1 1 1
## 25712 1 1 1 1
## 1177 1 1 1 1
## 31197 1 1 1 1
## 513 1 1 1 1
## 22467 1 1 1 1
## 303 1 1 1 1
## 15641 1 1 1 1
## 42 1 1 1 1
## 681 1 1 1 1
## 42 1 1 1 1
## 895 1 1 1 1
## 33 1 1 1 1
## 271 1 1 1 1
## 18 1 1 1 1
## 209 1 1 1 1
## 5 1 1 1 1
## 268 1 1 1 1
## 2 1 1 1 1
## 192 1 1 1 1
## 1 1 1 1 1
## 467 1 1 1 1
## 4 1 1 1 1
## 147 1 1 1 1
## 34 1 1 1 1
## 14 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 39 1 1 1 1
## 29 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 6 1 1 1 1
## 3 1 1 1 1
## 65 1 1 1 1
## 39 1 1 1 1
## 3 1 1 1 1
## 127 1 1 1 1
## 3 1 1 1 1
## 22 1 1 1 1
## 1 1 1 1 1
## 11 1 1 1 1
## 3 1 1 1 1
## 1 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 3 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## number_emergency number_inpatient number_diagnoses max_glu_serum
## 1043 1 1 1 1
## 25712 1 1 1 1
## 1177 1 1 1 1
## 31197 1 1 1 1
## 513 1 1 1 1
## 22467 1 1 1 1
## 303 1 1 1 1
## 15641 1 1 1 1
## 42 1 1 1 1
## 681 1 1 1 1
## 42 1 1 1 1
## 895 1 1 1 1
## 33 1 1 1 1
## 271 1 1 1 1
## 18 1 1 1 1
## 209 1 1 1 1
## 5 1 1 1 1
## 268 1 1 1 1
## 2 1 1 1 1
## 192 1 1 1 1
## 1 1 1 1 1
## 467 1 1 1 1
## 4 1 1 1 1
## 147 1 1 1 1
## 34 1 1 1 1
## 14 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 39 1 1 1 1
## 29 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 6 1 1 1 1
## 3 1 1 1 1
## 65 1 1 1 1
## 39 1 1 1 1
## 3 1 1 1 1
## 127 1 1 1 1
## 3 1 1 1 1
## 22 1 1 1 1
## 1 1 1 1 1
## 11 1 1 1 1
## 3 1 1 1 1
## 1 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 3 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1043 1 1 1 1 1 1
## 25712 1 1 1 1 1 1
## 1177 1 1 1 1 1 1
## 31197 1 1 1 1 1 1
## 513 1 1 1 1 1 1
## 22467 1 1 1 1 1 1
## 303 1 1 1 1 1 1
## 15641 1 1 1 1 1 1
## 42 1 1 1 1 1 1
## 681 1 1 1 1 1 1
## 42 1 1 1 1 1 1
## 895 1 1 1 1 1 1
## 33 1 1 1 1 1 1
## 271 1 1 1 1 1 1
## 18 1 1 1 1 1 1
## 209 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 268 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 192 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 467 1 1 1 1 1 1
## 4 1 1 1 1 1 1
## 147 1 1 1 1 1 1
## 34 1 1 1 1 1 1
## 14 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 39 1 1 1 1 1 1
## 29 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 65 1 1 1 1 1 1
## 39 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 127 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 22 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## 1043 1 1 1 1 1 1
## 25712 1 1 1 1 1 1
## 1177 1 1 1 1 1 1
## 31197 1 1 1 1 1 1
## 513 1 1 1 1 1 1
## 22467 1 1 1 1 1 1
## 303 1 1 1 1 1 1
## 15641 1 1 1 1 1 1
## 42 1 1 1 1 1 1
## 681 1 1 1 1 1 1
## 42 1 1 1 1 1 1
## 895 1 1 1 1 1 1
## 33 1 1 1 1 1 1
## 271 1 1 1 1 1 1
## 18 1 1 1 1 1 1
## 209 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 268 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 192 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 467 1 1 1 1 1 1
## 4 1 1 1 1 1 1
## 147 1 1 1 1 1 1
## 34 1 1 1 1 1 1
## 14 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 39 1 1 1 1 1 1
## 29 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 65 1 1 1 1 1 1
## 39 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 127 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 22 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 5 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## acarbose miglitol troglitazone tolazamide examide citoglipton insulin
## 1043 1 1 1 1 1 1 1
## 25712 1 1 1 1 1 1 1
## 1177 1 1 1 1 1 1 1
## 31197 1 1 1 1 1 1 1
## 513 1 1 1 1 1 1 1
## 22467 1 1 1 1 1 1 1
## 303 1 1 1 1 1 1 1
## 15641 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1
## 681 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1
## 895 1 1 1 1 1 1 1
## 33 1 1 1 1 1 1 1
## 271 1 1 1 1 1 1 1
## 18 1 1 1 1 1 1 1
## 209 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1
## 268 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 192 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 467 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1
## 147 1 1 1 1 1 1 1
## 34 1 1 1 1 1 1 1
## 14 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 39 1 1 1 1 1 1 1
## 29 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 65 1 1 1 1 1 1 1
## 39 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 127 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 22 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 1043 1 1 1
## 25712 1 1 1
## 1177 1 1 1
## 31197 1 1 1
## 513 1 1 1
## 22467 1 1 1
## 303 1 1 1
## 15641 1 1 1
## 42 1 1 1
## 681 1 1 1
## 42 1 1 1
## 895 1 1 1
## 33 1 1 1
## 271 1 1 1
## 18 1 1 1
## 209 1 1 1
## 5 1 1 1
## 268 1 1 1
## 2 1 1 1
## 192 1 1 1
## 1 1 1 1
## 467 1 1 1
## 4 1 1 1
## 147 1 1 1
## 34 1 1 1
## 14 1 1 1
## 1 1 1 1
## 7 1 1 1
## 1 1 1 1
## 1 1 1 1
## 39 1 1 1
## 29 1 1 1
## 1 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 6 1 1 1
## 3 1 1 1
## 65 1 1 1
## 39 1 1 1
## 3 1 1 1
## 127 1 1 1
## 3 1 1 1
## 22 1 1 1
## 1 1 1 1
## 11 1 1 1
## 3 1 1 1
## 1 1 1 1
## 5 1 1 1
## 5 1 1 1
## 5 1 1 1
## 3 1 1 1
## 2 1 1 1
## 1 1 1 1
## 0 0 0
## metformin.rosiglitazone metformin.pioglitazone change diabetesMed
## 1043 1 1 1 1
## 25712 1 1 1 1
## 1177 1 1 1 1
## 31197 1 1 1 1
## 513 1 1 1 1
## 22467 1 1 1 1
## 303 1 1 1 1
## 15641 1 1 1 1
## 42 1 1 1 1
## 681 1 1 1 1
## 42 1 1 1 1
## 895 1 1 1 1
## 33 1 1 1 1
## 271 1 1 1 1
## 18 1 1 1 1
## 209 1 1 1 1
## 5 1 1 1 1
## 268 1 1 1 1
## 2 1 1 1 1
## 192 1 1 1 1
## 1 1 1 1 1
## 467 1 1 1 1
## 4 1 1 1 1
## 147 1 1 1 1
## 34 1 1 1 1
## 14 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 39 1 1 1 1
## 29 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 6 1 1 1 1
## 3 1 1 1 1
## 65 1 1 1 1
## 39 1 1 1 1
## 3 1 1 1 1
## 127 1 1 1 1
## 3 1 1 1 1
## 22 1 1 1 1
## 1 1 1 1 1
## 11 1 1 1 1
## 3 1 1 1 1
## 1 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 3 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## readmitted diag_1 diag_2 diag_3 race payer_code medical_specialty weight
## 1043 1 1 1 1 1 1 1 1
## 25712 1 1 1 1 1 1 1 0
## 1177 1 1 1 1 1 1 0 1
## 31197 1 1 1 1 1 1 0 0
## 513 1 1 1 1 1 0 1 1
## 22467 1 1 1 1 1 0 1 0
## 303 1 1 1 1 1 0 0 1
## 15641 1 1 1 1 1 0 0 0
## 42 1 1 1 1 0 1 1 1
## 681 1 1 1 1 0 1 1 0
## 42 1 1 1 1 0 1 0 1
## 895 1 1 1 1 0 1 0 0
## 33 1 1 1 1 0 0 1 1
## 271 1 1 1 1 0 0 1 0
## 18 1 1 1 1 0 0 0 1
## 209 1 1 1 1 0 0 0 0
## 5 1 1 1 0 1 1 1 1
## 268 1 1 1 0 1 1 1 0
## 2 1 1 1 0 1 1 0 1
## 192 1 1 1 0 1 1 0 0
## 1 1 1 1 0 1 0 1 1
## 467 1 1 1 0 1 0 1 0
## 4 1 1 1 0 1 0 0 1
## 147 1 1 1 0 1 0 0 0
## 34 1 1 1 0 0 1 1 0
## 14 1 1 1 0 0 1 0 0
## 1 1 1 1 0 0 0 1 1
## 7 1 1 1 0 0 0 1 0
## 1 1 1 1 0 0 0 0 1
## 1 1 1 1 0 0 0 0 0
## 39 1 1 0 1 1 1 1 0
## 29 1 1 0 1 1 1 0 0
## 1 1 1 0 1 1 0 1 1
## 2 1 1 0 1 1 0 1 0
## 1 1 1 0 1 1 0 0 1
## 1 1 1 0 1 1 0 0 0
## 6 1 1 0 1 0 1 1 0
## 3 1 1 0 0 1 1 1 1
## 65 1 1 0 0 1 1 1 0
## 39 1 1 0 0 1 1 0 0
## 3 1 1 0 0 1 0 1 1
## 127 1 1 0 0 1 0 1 0
## 3 1 1 0 0 1 0 0 1
## 22 1 1 0 0 1 0 0 0
## 1 1 1 0 0 0 1 1 1
## 11 1 1 0 0 0 1 1 0
## 3 1 1 0 0 0 1 0 0
## 1 1 1 0 0 0 0 1 0
## 5 1 0 1 1 1 1 1 0
## 5 1 0 1 1 1 1 0 0
## 5 1 0 1 1 1 0 1 0
## 3 1 0 1 1 1 0 0 0
## 2 1 0 1 1 0 0 1 0
## 1 1 0 0 0 1 0 1 0
## 0 21 358 1423 2273 40256 49949 98569
##
## 1043 0
## 25712 1
## 1177 1
## 31197 2
## 513 1
## 22467 2
## 303 2
## 15641 3
## 42 1
## 681 2
## 42 2
## 895 3
## 33 2
## 271 3
## 18 3
## 209 4
## 5 1
## 268 2
## 2 2
## 192 3
## 1 2
## 467 3
## 4 3
## 147 4
## 34 3
## 14 4
## 1 3
## 7 4
## 1 4
## 1 5
## 39 2
## 29 3
## 1 2
## 2 3
## 1 3
## 1 4
## 6 3
## 3 2
## 65 3
## 39 4
## 3 3
## 127 4
## 3 4
## 22 5
## 1 3
## 11 4
## 3 5
## 1 5
## 5 2
## 5 3
## 5 3
## 3 4
## 2 4
## 1 5
## 192849
#3:First Drop
#removing the most missed variables and irrelevant variables(encounter id,patient nbr,weight and payer_code(insurance) and medical speciality, and )
diabetic_largedata<-diabetic_largedata[,-c(1,2,6,11,12)]
str(diabetic_largedata)
## 'data.frame': 101766 obs. of 45 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 3 1 3 3 3 3 3 3 3 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
## $ age : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ admission_type_id : int 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: int 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : int 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : int 1 3 2 2 1 3 4 5 13 12 ...
## $ num_lab_procedures : int 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : int 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : int 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : int 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 716 levels "10","11","110",..: 125 144 455 555 55 264 264 277 253 283 ...
## $ diag_2 : Factor w/ 748 levels "11","110","111",..: NA 80 79 98 25 247 247 315 261 47 ...
## $ diag_3 : Factor w/ 789 levels "11","110","111",..: NA 122 767 249 87 87 771 87 230 318 ...
## $ number_diagnoses : int 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ A1Cresult : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
## $ repaglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ nateglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ chlorpropamide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glimepiride : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
## $ acetohexamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glipizide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
## $ glyburide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
## $ tolbutamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ pioglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ rosiglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
## $ acarbose : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ miglitol : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ troglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ tolazamide : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ examide : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ citoglipton : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
## $ glyburide.metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glipizide.metformin : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.pioglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ readmitted : Factor w/ 3 levels "<30",">30","NO": 3 2 3 3 3 2 3 2 3 3 ...
#just check if we have any question mark
plot(sapply(diabetic_largedata,function(x)sum(grepl("\\?",x))))
plot(sapply(diabetic_largedata,function(x)sum(grepl(" Unknown/Invalid",x))))
#many of them are diabetes medication mybe we can group all of them together (group all the medication together) or we can just keep diabetesMed
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(diabetic_largedata,main="NA's Precentage",col=c("black","pink"))
##4:Groupimg the Outcome variable
#here we have 3 classes for outcome varible(readmited) make it to 2 classes
table(diabetic_largedata$readmitted)
##
## <30 >30 NO
## 11357 35545 54864
#the patient who come after 30 days it is considered as "YES" if he come back before 30 days the hospital should pay all(reimbersment) so they want to predict the ones that are ptential to come bac to prevent than and don't discharge them soon !
diabetic_largedata$readmitted<-ifelse(diabetic_largedata$readmitted==">30"|diabetic_largedata$readmitted=="<30","YES","NO")
table(diabetic_largedata$readmitted)
##
## NO YES
## 54864 46902
table(diabetic_largedata$A1Cresult)
##
## >7 >8 None Norm
## 3812 8216 84748 4990
table(diabetic_largedata$max_glu_serum)
##
## >200 >300 None Norm
## 1485 1264 96420 2597
diabetic_largedata<-subset(diabetic_largedata,max_glu_serum!="None",)
diabetic_largedata<-subset(diabetic_largedata,A1Cresult!="None",)
diabetic_largedata<-droplevels(diabetic_largedata)
plot(table(diabetic_largedata$A1Cresult))
plot(table(diabetic_largedata$max_glu_serum))
nrow(diabetic_largedata)
## [1] 298
##Checking imbalanceness of Outcome
#install.packages("lessR")
library(lessR)
##
## lessR 4.2.8 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
## Enter: browseVignettes("lessR")
##
## View changes in this and recent versions of lessR.
## Enter: news(package="lessR")
##
## Interactive data analysis.
## Enter: interact()
PieChart(data=diabetic_largedata,readmitted, fill = c("orange", "blue"), main = "Class distributionoof readmitted",values = "%")
## >>> suggestions
## PieChart(readmitted, hole=0) # traditional pie chart
## PieChart(readmitted, values="%") # display %'s on the chart
## PieChart(readmitted) # bar chart
## Plot(readmitted) # bubble plot
## Plot(readmitted, values="count") # lollipop plot
##
## --- readmitted ---
##
## NO YES Total
## Frequencies: 123 175 298
## Proportions: 0.413 0.587 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 9.074, df = 1, p-value = 0.003
#Second drop
#install.packages("Amelia")
library(naniar)
missmap(diabetic_largedata)
gg_miss_var(diabetic_largedata)
gg_miss_upset(diabetic_largedata)
#around 2% of the data can be omitted
#dropping the rows that contain NAs
diabetic_largedata<-na.omit(diabetic_largedata)
nrow(diabetic_largedata)
## [1] 289
plot_missing(diabetic_largedata)
#Checking correlations of categorical
#using polycor for getting the correlation of some variables with categorical variables
library(polycor)
#correlation of admission_type_id,discharge_disposition_id,admission_source_id with the outcome
correlationtable <- data.frame(
variable = c("admission_source_id", "discharge_disposition_id", "admission_type_id","diag1","diag2","diag3"),
readmitted = c(
polychor(diabetic_largedata$admission_source_id, diabetic_largedata$readmitted),
polychor(diabetic_largedata$discharge_disposition_id, diabetic_largedata$readmitted),
polychor(diabetic_largedata$admission_type_id, diabetic_largedata$readmitted),
polychor(diabetic_largedata$diag_1,diabetic_largedata$readmitted),
polychor(diabetic_largedata$diag_2,diabetic_largedata$readmitted),
polychor(diabetic_largedata$diag_3,diabetic_largedata$readmitted)
),
age=c(polychor(diabetic_largedata$admission_source_id,diabetic_largedata$age),
polychor(diabetic_largedata$discharge_disposition_id,diabetic_largedata$age),
polychor(diabetic_largedata$admission_type_id,diabetic_largedata$age),
polychor(diabetic_largedata$diag_1,diabetic_largedata$age),
polychor(diabetic_largedata$diag_2,diabetic_largedata$age),
polychor(diabetic_largedata$diag_3,diabetic_largedata$age)
)
)
correlationtable
## variable readmitted age
## 1 admission_source_id -0.23278752 -0.033934736
## 2 discharge_disposition_id 0.05802495 0.237600899
## 3 admission_type_id -0.17096102 0.209122762
## 4 diag1 -0.04364375 0.005544923
## 5 diag2 -0.01033573 0.017288829
## 6 diag3 0.07194527 0.002855034
#Checking corelation of Numeric variables
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lessR':
##
## recode, rename
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
numdiab = c()
for (i in 1:ncol(diabetic_largedata)) {
if (is.numeric(diabetic_largedata[,i])){
numdiab = append(numdiab, i)
}
}
numdiab<-diabetic_largedata[,numdiab]
numdiab<-cbind(numdiab,as.numeric(diabetic_largedata$diag_1),as.numeric(diabetic_largedata$diag_2),as.numeric(diabetic_largedata$diag_3))
head(numdiab)
## admission_type_id discharge_disposition_id admission_source_id
## 163 6 3 7
## 461 6 1 7
## 594 6 1 7
## 697 6 6 7
## 772 6 1 2
## 824 6 1 7
## time_in_hospital num_lab_procedures num_procedures num_medications
## 163 5 47 1 6
## 461 10 72 1 19
## 594 2 61 0 5
## 697 11 71 1 20
## 772 14 43 0 11
## 824 7 105 3 16
## number_outpatient number_emergency number_inpatient number_diagnoses
## 163 0 0 0 5
## 461 0 0 0 5
## 594 0 0 0 5
## 697 0 0 0 5
## 772 0 0 0 3
## 824 0 0 0 5
## as.numeric(diabetic_largedata$diag_1) as.numeric(diabetic_largedata$diag_2)
## 163 24 28
## 461 5 23
## 594 19 89
## 697 87 9
## 772 22 7
## 824 36 9
## as.numeric(diabetic_largedata$diag_3)
## 163 46
## 461 28
## 594 9
## 697 100
## 772 66
## 824 21
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(numdiab),type='upper',tl.srt = 30,method="number",tl.cex = 0.6,bg="gray",title = "Correlation of Numeric Variables")
#admistion type id is highly corelated with some variables but since it is id it should be categorical to prevent over shadow of bigger numbers so I made another data set of literraly numeric varibles
numdiab2<-diabetic_largedata[, c("time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_diagnoses", "number_outpatient", "number_inpatient","number_emergency")]
corrplot(cor(numdiab2),type='upper',method = 'pie')
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
numdiab2$readmitted<-diabetic_largedata$readmitted
ggpairs(numdiab2, aes(alpha=0.1, color = readmitted))+ theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
strip.text = element_text(size = 12),
panel.spacing = unit(0.2, "lines"),
panel.background = element_blank(),
panel.border = element_rect(color = "black", fill = NA, size = 0.1),
plot.margin = unit(c(1,1,1,1), "cm"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "race"
## [2,] "gender"
## [3,] "age"
## [4,] "admission_type_id"
## [5,] "discharge_disposition_id"
## [6,] "admission_source_id"
## [7,] "time_in_hospital"
## [8,] "num_lab_procedures"
## [9,] "num_procedures"
## [10,] "num_medications"
## [11,] "number_outpatient"
## [12,] "number_emergency"
## [13,] "number_inpatient"
## [14,] "diag_1"
## [15,] "diag_2"
## [16,] "diag_3"
## [17,] "number_diagnoses"
## [18,] "max_glu_serum"
## [19,] "A1Cresult"
## [20,] "metformin"
## [21,] "repaglinide"
## [22,] "nateglinide"
## [23,] "chlorpropamide"
## [24,] "glimepiride"
## [25,] "acetohexamide"
## [26,] "glipizide"
## [27,] "glyburide"
## [28,] "tolbutamide"
## [29,] "pioglitazone"
## [30,] "rosiglitazone"
## [31,] "acarbose"
## [32,] "miglitol"
## [33,] "troglitazone"
## [34,] "tolazamide"
## [35,] "examide"
## [36,] "citoglipton"
## [37,] "insulin"
## [38,] "glyburide.metformin"
## [39,] "glipizide.metformin"
## [40,] "glimepiride.pioglitazone"
## [41,] "metformin.rosiglitazone"
## [42,] "metformin.pioglitazone"
## [43,] "change"
## [44,] "diabetesMed"
## [45,] "readmitted"
#Diabetics medications = 20:42
summary(diabetic_largedata[20:42])
## metformin repaglinide nateglinide chlorpropamide glimepiride
## Down : 2 No :278 No :288 No :288 No :282
## No :245 Steady: 10 Steady: 1 Steady: 1 Steady: 6
## Steady: 36 Up : 1 Up : 1
## Up : 6
## acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## No:289 Down : 2 No :274 No:289 Down : 1 No :279
## No :243 Steady: 14 No :276 Steady: 8
## Steady: 38 Up : 1 Steady: 12 Up : 2
## Up : 6
## acarbose miglitol troglitazone tolazamide examide citoglipton
## No :287 No:289 No:289 No:289 No:289 No:289
## Steady: 1
## Up : 1
##
## insulin glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## Down : 20 No :288 No:289 No:289
## No :202 Steady: 1
## Steady: 53
## Up : 14
## metformin.rosiglitazone metformin.pioglitazone
## No:289 No:289
##
##
##
#Third Drop : Medications
diabetic_largedata<-diabetic_largedata[, -c(20:36,38:42)]
head(diabetic_largedata)
## race gender age admission_type_id discharge_disposition_id
## 163 Caucasian Male [80-90) 6 3
## 461 AfricanAmerican Female [70-80) 6 1
## 594 Caucasian Female [50-60) 6 1
## 697 Other Male [70-80) 6 6
## 772 Caucasian Female [30-40) 6 1
## 824 Caucasian Male [80-90) 6 1
## admission_source_id time_in_hospital num_lab_procedures num_procedures
## 163 7 5 47 1
## 461 7 10 72 1
## 594 7 2 61 0
## 697 7 11 71 1
## 772 2 14 43 0
## 824 7 7 105 3
## num_medications number_outpatient number_emergency number_inpatient diag_1
## 163 6 0 0 0 332
## 461 19 0 0 0 250.02
## 594 5 0 0 0 276
## 697 20 0 0 0 820
## 772 11 0 0 0 296
## 824 16 0 0 0 428
## diag_2 diag_3 number_diagnoses max_glu_serum A1Cresult insulin change
## 163 294 425 5 >200 Norm No No
## 461 276 294 5 >300 >8 Up Ch
## 594 780 250.03 5 >300 >8 Steady No
## 697 250.02 E885 5 >200 >7 No No
## 772 250 564 3 Norm >7 No No
## 824 250.02 276 5 >300 >7 No No
## diabetesMed readmitted
## 163 No YES
## 461 Yes YES
## 594 Yes NO
## 697 Yes NO
## 772 No YES
## 824 Yes YES
summary(diabetic_largedata)
## race gender age admission_type_id
## AfricanAmerican: 51 Female:168 [50-60):65 Min. :1.000
## Asian : 7 Male :121 [60-70):60 1st Qu.:6.000
## Caucasian :180 [70-80):60 Median :6.000
## Hispanic : 38 [80-90):41 Mean :5.024
## Other : 13 [40-50):38 3rd Qu.:6.000
## [30-40):17 Max. :6.000
## (Other): 8
## discharge_disposition_id admission_source_id time_in_hospital
## Min. : 1.000 Min. :1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.:7.000 1st Qu.: 3.000
## Median : 1.000 Median :7.000 Median : 5.000
## Mean : 2.197 Mean :6.488 Mean : 5.398
## 3rd Qu.: 3.000 3rd Qu.:7.000 3rd Qu.: 7.000
## Max. :13.000 Max. :7.000 Max. :14.000
##
## num_lab_procedures num_procedures num_medications number_outpatient
## Min. : 31.0 Min. :0.0000 Min. : 1.00 Min. :0.0000
## 1st Qu.: 54.0 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.:0.0000
## Median : 63.0 Median :0.0000 Median :14.00 Median :0.0000
## Mean : 64.2 Mean :0.8443 Mean :14.54 Mean :0.1592
## 3rd Qu.: 74.0 3rd Qu.:1.0000 3rd Qu.:19.00 3rd Qu.:0.0000
## Max. :106.0 Max. :6.0000 Max. :35.00 Max. :6.0000
##
## number_emergency number_inpatient diag_1 diag_2 diag_3
## Min. :0.000 Min. :0.0000 491 : 21 250 : 22 250 : 24
## 1st Qu.:0.000 1st Qu.:0.0000 428 : 19 250.02 : 19 401 : 20
## Median :0.000 Median :0.0000 682 : 19 276 : 17 276 : 16
## Mean :0.173 Mean :0.6678 414 : 14 411 : 11 250.02 : 15
## 3rd Qu.:0.000 3rd Qu.:1.0000 786 : 12 428 : 10 414 : 13
## Max. :9.000 Max. :9.0000 250.02 : 11 496 : 10 272 : 11
## (Other):193 (Other):200 (Other):190
## number_diagnoses max_glu_serum A1Cresult insulin change diabetesMed
## Min. :3.000 >200: 69 >7 : 63 Down : 20 Ch: 84 No :118
## 1st Qu.:5.000 >300:124 >8 :171 No :202 No:205 Yes:171
## Median :6.000 Norm: 96 Norm: 55 Steady: 53
## Mean :5.958 Up : 14
## 3rd Qu.:6.000
## Max. :9.000
##
## readmitted
## Length:289
## Class :character
## Mode :character
##
##
##
##
str(diabetic_largedata)
## 'data.frame': 289 obs. of 23 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
## $ age : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
## $ admission_type_id : int 6 6 6 6 6 6 6 6 6 6 ...
## $ discharge_disposition_id: int 3 1 1 6 1 1 1 1 1 10 ...
## $ admission_source_id : int 7 7 7 7 2 7 7 7 7 1 ...
## $ time_in_hospital : int 5 10 2 11 14 7 2 3 2 4 ...
## $ num_lab_procedures : int 47 72 61 71 43 105 66 76 43 41 ...
## $ num_procedures : int 1 1 0 1 0 3 0 0 0 1 ...
## $ num_medications : int 6 19 5 20 11 16 3 9 13 8 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 92 levels "112","162","188",..: 24 5 19 87 22 36 6 21 73 76 ...
## $ diag_2 : Factor w/ 104 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
## $ diag_3 : Factor w/ 104 levels "198","208","211",..: 46 28 9 100 66 21 19 31 93 6 ...
## $ number_diagnoses : int 5 5 5 5 3 5 3 5 5 3 ...
## $ max_glu_serum : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
## $ A1Cresult : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
## $ readmitted : chr "YES" "YES" "NO" "NO" ...
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:polycor':
##
## polyserial
## The following objects are masked from 'package:lessR':
##
## reflect, rescale, scree, skew
data.frame(describe.by(diabetic_largedata))
## vars n mean sd median trimmed
## race* 1 289 2.8442907 1.0103511 3 2.8712446
## gender* 2 289 1.4186851 0.4941993 1 1.3991416
## age* 3 289 5.7024221 1.5947051 6 5.7896996
## admission_type_id 4 289 5.0242215 1.9603982 6 5.3905579
## discharge_disposition_id 5 289 2.1972318 2.1584442 1 1.7339056
## admission_source_id 6 289 6.4878893 1.6689647 7 7.0000000
## time_in_hospital 7 289 5.3979239 3.0477899 5 5.1158798
## num_lab_procedures 8 289 64.2006920 14.4842299 63 63.6523605
## num_procedures 9 289 0.8442907 1.2388294 0 0.6480687
## num_medications 10 289 14.5397924 7.3238083 14 14.0858369
## number_outpatient 11 289 0.1591696 0.6736050 0 0.0000000
## number_emergency 12 289 0.1730104 0.9267429 0 0.0000000
## number_inpatient 13 289 0.6678201 1.3018559 0 0.3690987
## diag_1* 14 289 43.4359862 24.1849834 39 43.0386266
## diag_2* 15 289 45.3183391 28.9008550 46 44.1072961
## diag_3* 16 289 42.2249135 28.5040434 38 40.4291845
## number_diagnoses 17 289 5.9584775 1.5224041 6 5.9055794
## max_glu_serum* 18 289 2.0934256 0.7511045 2 2.1158798
## A1Cresult* 19 289 1.9723183 0.6394946 2 1.9656652
## insulin* 20 289 2.2110727 0.6349884 2 2.1673820
## change* 21 289 1.7093426 0.4548534 2 1.7596567
## diabetesMed* 22 289 1.5916955 0.4923726 2 1.6137339
## readmitted* 23 289 1.5986159 0.4910287 2 1.6223176
## mad min max range skew kurtosis
## race* 0.0000 1 5 4 -0.45149674 0.12878562
## gender* 0.0000 1 2 1 0.32793857 -1.89899132
## age* 1.4826 1 8 7 -0.45607934 -0.18448163
## admission_type_id 0.0000 1 6 5 -1.51625238 0.33020332
## discharge_disposition_id 0.0000 1 13 12 2.08689810 4.41083501
## admission_source_id 0.0000 1 7 6 -2.93763596 6.67378573
## time_in_hospital 2.9652 1 14 13 0.79335910 0.09585215
## num_lab_procedures 14.8260 31 106 75 0.33938332 -0.22361907
## num_procedures 0.0000 0 6 6 1.30048520 0.79265021
## num_medications 7.4130 1 35 34 0.57284441 0.02853114
## number_outpatient 0.0000 0 6 6 5.50694384 34.51658080
## number_emergency 0.0000 0 9 9 6.77369511 49.50648338
## number_inpatient 0.0000 0 9 9 2.84011121 10.32269040
## diag_1* 28.1694 1 92 91 0.13025342 -0.97016096
## diag_2* 34.0998 1 104 103 0.20413932 -1.14550741
## diag_3* 31.1346 1 104 103 0.43363785 -0.96011570
## number_diagnoses 1.4826 3 9 6 0.54638627 0.32225443
## max_glu_serum* 1.4826 1 3 2 -0.15330834 -1.22262668
## A1Cresult* 0.0000 1 3 2 0.02364437 -0.56572540
## insulin* 0.0000 1 4 3 0.92913449 1.41287848
## change* 0.0000 1 2 1 -0.91729867 -1.16254993
## diabetesMed* 0.0000 1 2 1 -0.37117505 -1.86865915
## readmitted* 0.0000 1 2 1 -0.40028082 -1.84612737
## se
## race* 0.05943242
## gender* 0.02907055
## age* 0.09380618
## admission_type_id 0.11531754
## discharge_disposition_id 0.12696730
## admission_source_id 0.09817439
## time_in_hospital 0.17928176
## num_lab_procedures 0.85201352
## num_procedures 0.07287232
## num_medications 0.43081225
## number_outpatient 0.03962383
## number_emergency 0.05451429
## number_inpatient 0.07657976
## diag_1* 1.42264608
## diag_2* 1.70005029
## diag_3* 1.67670843
## number_diagnoses 0.08955318
## max_glu_serum* 0.04418262
## A1Cresult* 0.03761733
## insulin* 0.03735226
## change* 0.02675608
## diabetesMed* 0.02896310
## readmitted* 0.02888404
sapply(diabetic_largedata,function(x)sum(is.na(x)))
## race gender age
## 0 0 0
## admission_type_id discharge_disposition_id admission_source_id
## 0 0 0
## time_in_hospital num_lab_procedures num_procedures
## 0 0 0
## num_medications number_outpatient number_emergency
## 0 0 0
## number_inpatient diag_1 diag_2
## 0 0 0
## diag_3 number_diagnoses max_glu_serum
## 0 0 0
## A1Cresult insulin change
## 0 0 0
## diabetesMed readmitted
## 0 0
#gender age race
table(diabetic_largedata$race,diabetic_largedata$age)
##
## [10-20) [20-30) [30-40) [40-50) [50-60) [60-70) [70-80)
## AfricanAmerican 1 2 4 10 14 8 8
## Asian 0 0 0 1 3 2 1
## Caucasian 3 2 9 19 37 30 45
## Hispanic 0 0 2 8 10 13 3
## Other 0 0 2 0 1 7 3
##
## [80-90)
## AfricanAmerican 4
## Asian 0
## Caucasian 35
## Hispanic 2
## Other 0
ggplot(diabetic_largedata, aes(readmitted)) +
geom_bar(aes(age, fill = factor(readmitted)), position = 'stack')+
scale_fill_manual(values = c("#bff5cc", "#009f71"))+
ylab("Readmission")+
xlab("age")+
ggtitle("Age Vs Readmission")
ggplot(diabetic_largedata, aes(readmitted)) +
geom_bar(aes(race, fill = factor(readmitted)), position = 'stack')+
scale_fill_manual(values = c("#bea9de", "#895ae8"))+
ylab("Readmission")+
xlab("Race")+
ggtitle("Race Vs Readmission")
lapply(diabetic_largedata,function(x)table(x))
## $race
## x
## AfricanAmerican Asian Caucasian Hispanic Other
## 51 7 180 38 13
##
## $gender
## x
## Female Male
## 168 121
##
## $age
## x
## [10-20) [20-30) [30-40) [40-50) [50-60) [60-70) [70-80) [80-90)
## 4 4 17 38 65 60 60 41
##
## $admission_type_id
## x
## 1 2 3 6
## 52 4 2 231
##
## $discharge_disposition_id
## x
## 1 2 3 5 6 7 10 11 13
## 190 20 33 5 27 9 1 3 1
##
## $admission_source_id
## x
## 1 2 7
## 23 2 264
##
## $time_in_hospital
## x
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 16 35 33 51 34 34 22 15 15 14 5 7 3 5
##
## $num_lab_procedures
## x
## 31 32 36 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 1 2 2 2 4 2 6 2 4 5 2 4 9 5 7 7 6 8
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 6 8 8 10 5 7 7 6 9 6 9 5 8 8 7 6 6 9 4 5
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 90 91 93 94 95 96
## 6 5 8 4 6 5 4 2 2 4 2 1 2 3 1 2 2 1 2 2
## 97 98 102 105 106
## 1 2 1 1 1
##
## $num_procedures
## x
## 0 1 2 3 4 5 6
## 175 42 23 44 2 2 1
##
## $num_medications
## x
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 2 3 10 5 8 14 7 11 15 9 24 14 21 16 10 21 13 4 19 10 8 5 4 3 6 7
## 27 28 29 30 31 32 33 34 35
## 3 4 2 1 1 2 1 4 2
##
## $number_outpatient
## x
## 0 1 2 3 4 5 6
## 266 13 3 4 1 1 1
##
## $number_emergency
## x
## 0 1 2 3 5 6 7 9
## 272 8 3 1 1 2 1 1
##
## $number_inpatient
## x
## 0 1 2 3 4 5 6 7 9
## 197 42 25 15 4 1 2 2 1
##
## $diag_1
## x
## 112 162 188 250 250.02 250.03 250.1 250.11 250.12 250.13 250.22
## 1 2 1 1 11 3 2 2 4 7 2
## 250.6 250.7 250.8 250.81 250.82 250.83 253 276 280 295 296
## 6 1 7 2 3 1 1 2 1 1 2
## 298 332 340 376 38 401 402 403 410 411 414
## 1 2 1 1 8 4 1 1 5 1 14
## 415 427 428 433 434 435 436 437 443 444 451
## 1 4 19 2 9 8 1 1 2 1 1
## 453 458 486 491 493 507 515 518 53 531 535
## 3 1 9 21 9 1 1 2 1 2 2
## 537 542 558 560 562 564 566 569 571 574 577
## 1 0 3 1 3 1 0 2 1 2 4
## 578 584 590 596 599 681 682 707 714 715 722
## 2 2 4 1 4 1 19 2 1 2 2
## 730 733 780 784 785 786 79 8 807 820 965
## 1 1 1 1 1 12 1 1 2 1 1
## 969 996 V57 V58
## 1 2 1 1
##
## $diag_2
## x
## 162 174 197 211 218 244 250 250.01 250.02 250.03 250.11
## 1 1 1 1 1 1 22 7 19 3 1
## 250.12 250.13 250.4 250.42 250.43 250.6 250.7 250.8 250.82 250.83 272
## 1 1 2 1 1 5 1 1 3 2 3
## 276 278 280 285 286 294 295 300 305 327 331
## 17 1 1 2 2 5 1 1 1 2 1
## 337 348 349 357 358 38 382 386 401 402 403
## 1 1 1 2 1 3 1 1 9 2 5
## 41 410 411 413 414 415 424 425 427 428 433
## 4 2 11 2 7 1 1 4 4 10 2
## 435 437 441 443 455 466 473 482 486 491 493
## 1 1 1 2 1 1 1 2 3 7 2
## 496 511 515 518 530 532 535 536 569 571 577
## 10 2 2 2 1 1 2 1 1 1 1
## 584 585 590 599 607 682 707 716 724 730 758
## 2 1 1 10 1 3 9 1 1 1 1
## 780 781 785 786 787 789 790 792 799 8 995
## 3 2 1 2 1 1 1 1 1 1 6
## 998 E888 E906 E980 V58
## 0 1 1 1 1
##
## $diag_3
## x
## 198 208 211 216 238 250 250.01 250.02 250.03 250.12 250.4
## 1 1 1 1 1 24 3 15 6 1 1
## 250.41 250.42 250.43 250.5 250.53 250.6 250.8 272 275 276 278
## 1 1 1 1 1 4 3 11 1 16 4
## 280 285 287 288 293 294 295 296 300 303 327
## 4 4 1 3 1 2 2 1 1 2 2
## 332 345 357 381 401 402 403 41 413 414 416
## 1 1 3 1 20 1 3 4 1 13 1
## 424 425 426 427 428 433 435 443 446 453 458
## 2 5 1 5 9 2 1 2 0 1 1
## 466 486 493 496 5 511 518 530 535 536 564
## 1 2 2 3 1 1 4 2 3 1 1
## 571 572 574 575 581 583 584 585 592 593 599
## 1 1 1 1 2 1 2 3 1 1 7
## 625 681 682 707 724 733 737 780 783 785 786
## 1 1 5 5 1 2 1 4 1 4 1
## 790 794 799 826 891 920 945 962 995 E849 E880
## 1 2 2 1 1 1 1 1 3 1 1
## E885 E932 E950 V12 V58
## 1 1 1 1 2
##
## $number_diagnoses
## x
## 3 4 5 6 7 8 9
## 18 13 67 142 3 6 40
##
## $max_glu_serum
## x
## >200 >300 Norm
## 69 124 96
##
## $A1Cresult
## x
## >7 >8 Norm
## 63 171 55
##
## $insulin
## x
## Down No Steady Up
## 20 202 53 14
##
## $change
## x
## Ch No
## 84 205
##
## $diabetesMed
## x
## No Yes
## 118 171
##
## $readmitted
## x
## NO YES
## 116 173
###Data Visualization
#when you have categorical and continues column you can come up with box plot
library(ggplot2)
ggplot(diabetic_largedata, aes_string(x="readmitted",y=diabetic_largedata$num_lab_procedures)) + geom_boxplot(aes(fill=readmitted))+ggtitle("num_lab_procedures grouped by readmitted")
library(ggbeeswarm)
ggplot(diabetic_largedata, aes_string(x="readmitted",y=diabetic_largedata$num_lab_procedures)) + geom_boxplot(aes(fill=readmitted,color=readmitted)) + geom_quasirandom(alpha = 0.3)
#Ploting boxplot for all continues variable
continuous_vars <- names(diabetic_largedata)[sapply(diabetic_largedata, is.numeric)]
plots <- list()
for (var in continuous_vars) {
p<-ggplot(diabetic_largedata, aes(x = factor(readmitted), y = .data[[var]], fill = factor(readmitted))) +
geom_boxplot() +
labs(title = paste("Boxplot of", var)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
plots[[var]] <- p
}
print(plots)
## $admission_type_id
##
## $discharge_disposition_id
##
## $admission_source_id
##
## $time_in_hospital
##
## $num_lab_procedures
##
## $num_procedures
##
## $num_medications
##
## $number_outpatient
##
## $number_emergency
##
## $number_inpatient
##
## $number_diagnoses
continuous_vars <- names(diabetic_largedata)[sapply(diabetic_largedata, is.numeric)]
# create a long format dataset with continuous_vars and readmitted
df <- reshape2::melt(diabetic_largedata[, c(continuous_vars, "readmitted")], id.vars = "readmitted")
# create boxplots with outcome variable as fill color and facet by continuous variable
p <- ggplot(df, aes(x = factor(readmitted), y = value, fill = factor(readmitted))) +
geom_boxplot() +
facet_wrap(~variable, scales = "free_y") +
theme_bw()
print(p)
vars <- names(diabetic_largedata)
plots <- list()
for (var in vars) {
if(is.numeric(diabetic_largedata[[var]])) {
q1<-quantile(diabetic_largedata[[var]], 0.25)
q3<-quantile(diabetic_largedata[[var]], 0.75)
p <- ggplot(diabetic_largedata, aes(x = .data[[var]])) +
geom_histogram(binwidth = 0.3, col = "black", fill = "#8db700") +
geom_vline(xintercept = q1, col = "red", lwd = 2) +
geom_vline(xintercept = q3, col = "red", lwd = 2) +
labs(title = paste("Histogram of", var)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
plots[[var]]<-p
}else if (is.factor(diabetic_largedata[[var]])) {
p<- ggplot(diabetic_largedata, aes(x = .data[[var]], fill = readmitted)) +
geom_bar(position = "dodge") +
labs(title = paste("Barplot of", var)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
plots[[var]] <- p
}
}
print(plots)
## $race
##
## $gender
##
## $age
##
## $admission_type_id
##
## $discharge_disposition_id
##
## $admission_source_id
##
## $time_in_hospital
##
## $num_lab_procedures
##
## $num_procedures
##
## $num_medications
##
## $number_outpatient
##
## $number_emergency
##
## $number_inpatient
##
## $diag_1
##
## $diag_2
##
## $diag_3
##
## $number_diagnoses
##
## $max_glu_serum
##
## $A1Cresult
##
## $insulin
##
## $change
##
## $diabetesMed
vars <- names(diabetic_largedata)
plots <- list()
for (i in 1:(length(vars)-1)) {
for (j in (i+1):length(vars)) {
if(is.numeric(diabetic_largedata[[vars[i]]]) && is.numeric(diabetic_largedata[[vars[j]]])) {
p <- ggplot(diabetic_largedata, aes(x = .data[[vars[i]]], y = .data[[vars[j]]], color = factor(readmitted))) +
geom_point() +
labs(title = paste("Scatterplot of", vars[i], "vs.", vars[j])) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
plots[[paste(vars[i], vars[j], sep="_")]] <- p
}
}
}
print(plots)
## $admission_type_id_discharge_disposition_id
##
## $admission_type_id_admission_source_id
##
## $admission_type_id_time_in_hospital
##
## $admission_type_id_num_lab_procedures
##
## $admission_type_id_num_procedures
##
## $admission_type_id_num_medications
##
## $admission_type_id_number_outpatient
##
## $admission_type_id_number_emergency
##
## $admission_type_id_number_inpatient
##
## $admission_type_id_number_diagnoses
##
## $discharge_disposition_id_admission_source_id
##
## $discharge_disposition_id_time_in_hospital
##
## $discharge_disposition_id_num_lab_procedures
##
## $discharge_disposition_id_num_procedures
##
## $discharge_disposition_id_num_medications
##
## $discharge_disposition_id_number_outpatient
##
## $discharge_disposition_id_number_emergency
##
## $discharge_disposition_id_number_inpatient
##
## $discharge_disposition_id_number_diagnoses
##
## $admission_source_id_time_in_hospital
##
## $admission_source_id_num_lab_procedures
##
## $admission_source_id_num_procedures
##
## $admission_source_id_num_medications
##
## $admission_source_id_number_outpatient
##
## $admission_source_id_number_emergency
##
## $admission_source_id_number_inpatient
##
## $admission_source_id_number_diagnoses
##
## $time_in_hospital_num_lab_procedures
##
## $time_in_hospital_num_procedures
##
## $time_in_hospital_num_medications
##
## $time_in_hospital_number_outpatient
##
## $time_in_hospital_number_emergency
##
## $time_in_hospital_number_inpatient
##
## $time_in_hospital_number_diagnoses
##
## $num_lab_procedures_num_procedures
##
## $num_lab_procedures_num_medications
##
## $num_lab_procedures_number_outpatient
##
## $num_lab_procedures_number_emergency
##
## $num_lab_procedures_number_inpatient
##
## $num_lab_procedures_number_diagnoses
##
## $num_procedures_num_medications
##
## $num_procedures_number_outpatient
##
## $num_procedures_number_emergency
##
## $num_procedures_number_inpatient
##
## $num_procedures_number_diagnoses
##
## $num_medications_number_outpatient
##
## $num_medications_number_emergency
##
## $num_medications_number_inpatient
##
## $num_medications_number_diagnoses
##
## $number_outpatient_number_emergency
##
## $number_outpatient_number_inpatient
##
## $number_outpatient_number_diagnoses
##
## $number_emergency_number_inpatient
##
## $number_emergency_number_diagnoses
##
## $number_inpatient_number_diagnoses
#Diag bocketting :
str(diabetic_largedata)
## 'data.frame': 289 obs. of 23 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
## $ age : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
## $ admission_type_id : int 6 6 6 6 6 6 6 6 6 6 ...
## $ discharge_disposition_id: int 3 1 1 6 1 1 1 1 1 10 ...
## $ admission_source_id : int 7 7 7 7 2 7 7 7 7 1 ...
## $ time_in_hospital : int 5 10 2 11 14 7 2 3 2 4 ...
## $ num_lab_procedures : int 47 72 61 71 43 105 66 76 43 41 ...
## $ num_procedures : int 1 1 0 1 0 3 0 0 0 1 ...
## $ num_medications : int 6 19 5 20 11 16 3 9 13 8 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 92 levels "112","162","188",..: 24 5 19 87 22 36 6 21 73 76 ...
## $ diag_2 : Factor w/ 104 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
## $ diag_3 : Factor w/ 104 levels "198","208","211",..: 46 28 9 100 66 21 19 31 93 6 ...
## $ number_diagnoses : int 5 5 5 5 3 5 3 5 5 3 ...
## $ max_glu_serum : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
## $ A1Cresult : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
## $ readmitted : chr "YES" "YES" "NO" "NO" ...
# If any of the diagnosis codes falls in the range of 390-459 or is equal to 785, the row is assigned to the "diag_circ" category. If the code falls in the range of 460-519 or is equal to 786, the row is assigned to the "diag_resp" category. If the code falls in the range of 520-579 or is equal to 787, the row is assigned to the "diag_dig" category. If the code falls is greater than 251 but less than 249, the row is assigned to the "diag_diab" category. If the code falls in the range of 290-319 or is equal to 780 or 781, the row is assigned to the "diag_ment" category. If the code falls in the range of 800-999, the row is assigned to the "diag_inj" category. If the code falls in the range of 710-739 or is equal to 736, the row is assigned to the "diag_musc" category. If the code falls in the range of 580-629 or is equal to 788, the row is assigned to the "diag_geni" category. If the code falls in the range of 140-239, the row is assigned to the "diag_neop" category. If the code doesn't fall into any of these ranges, the row is assigned to the "diag_other" category.
diabetic_largedata$diag_circ <- 0
diabetic_largedata$diag_resp <- 0
diabetic_largedata$diag_dig <- 0
diabetic_largedata$diag_diab <- 0
diabetic_largedata$diag_inj <- 0
diabetic_largedata$diag_musc <-0
diabetic_largedata$diag_geni <- 0
diabetic_largedata$diag_neop <-0
diabetic_largedata$diag_other <- 0
# Loop through each row of the data frame
for (i in 1:nrow(diabetic_largedata)) {
# Check each diagnosis code in the row and assign to the appropriate category
for (j in 1:3) {
code <- as.character(diabetic_largedata[i, paste0("diag_", j)])
if (code >= "390" & code <= "459" | code == "785") {
diabetic_largedata[i, "diag_circ"] <- 1
} else if (code > "249" & code < "251") {
diabetic_largedata[i, "diag_diab"] <- 1
} else if (code >= "460" & code <= "519" | code == "786") {
diabetic_largedata[i, "diag_resp"] <- 1
} else if (code >= "520" & code <= "579" | code == "787") {
diabetic_largedata[i, "diag_dig"] <- 1
} else if (code >= "800" & code <= "999") {
diabetic_largedata[i, "diag_inj"] <- 1
} else if (code >= "710" & code <= "739") {
diabetic_largedata[i, "diag_musc"] <- 1
} else if (code >= "580" & code <= "629" | code == "788") {
diabetic_largedata[i, "diag_geni"] <- 1
} else if (code >= "140" & code <= "239" ){
diabetic_largedata[i, "diag_neop"] <- 1
} else (diabetic_largedata$diag_other <- as.numeric(apply(diabetic_largedata[, paste0("diag_", 1:3)], 1, function(x) any(x %in% c("780", "781", "784", paste0(790:799), paste0(240:249), paste0(251:279), paste0(680:709), "782", paste0(1:139), paste0(290:319), paste0(280:289), paste0(320:359), paste0(630:679), paste0(360:389), paste0(740:759), paste0("E", 0:9), paste0("V", 0:9))))))
}
}
str(diabetic_largedata)
## 'data.frame': 289 obs. of 32 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
## $ age : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
## $ admission_type_id : int 6 6 6 6 6 6 6 6 6 6 ...
## $ discharge_disposition_id: int 3 1 1 6 1 1 1 1 1 10 ...
## $ admission_source_id : int 7 7 7 7 2 7 7 7 7 1 ...
## $ time_in_hospital : int 5 10 2 11 14 7 2 3 2 4 ...
## $ num_lab_procedures : int 47 72 61 71 43 105 66 76 43 41 ...
## $ num_procedures : int 1 1 0 1 0 3 0 0 0 1 ...
## $ num_medications : int 6 19 5 20 11 16 3 9 13 8 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 92 levels "112","162","188",..: 24 5 19 87 22 36 6 21 73 76 ...
## $ diag_2 : Factor w/ 104 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
## $ diag_3 : Factor w/ 104 levels "198","208","211",..: 46 28 9 100 66 21 19 31 93 6 ...
## $ number_diagnoses : int 5 5 5 5 3 5 3 5 5 3 ...
## $ max_glu_serum : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
## $ A1Cresult : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
## $ readmitted : chr "YES" "YES" "NO" "NO" ...
## $ diag_circ : num 1 0 0 0 0 1 1 0 0 0 ...
## $ diag_resp : num 0 0 0 0 0 0 0 0 0 1 ...
## $ diag_dig : num 0 0 0 0 1 0 0 0 0 0 ...
## $ diag_diab : num 0 1 1 1 1 1 1 0 1 1 ...
## $ diag_inj : num 0 0 0 1 0 0 0 0 1 0 ...
## $ diag_musc : num 0 0 0 0 0 0 0 0 0 1 ...
## $ diag_geni : num 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_neop : num 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_other : num 1 1 1 0 1 1 1 1 1 0 ...
library(ggplot2)
diag_subset <- diabetic_largedata[, 23:32]
diag_long <- reshape2::melt(diag_subset)
## Using readmitted as id variables
ggplot(diag_long, aes(x = variable, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
stat_summary(aes(label = ..y..), fun = sum, geom = "text", position = position_dodge(width = 0.9), vjust = -0.5) +
labs(x = "Diagnosis", y = "Frequency") +
theme_minimal() +
theme(legend.position = "none")
#Ploting categorical variable #I haven’t add the new diags and remove
the diag1,2,3 from the data set I’ll add later
categorical_vars <- names(diabetic_largedata)[sapply(diabetic_largedata, is.factor)]
plots <- list()
for (var in categorical_vars) {
p <- ggplot(diabetic_largedata, aes(x = factor(readmitted), fill = .data[[var]])) +
geom_bar(position = "fill") +
labs(title = paste("Barplot of", var)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
plots[[var]] <- p
}
plots
## $race
##
## $gender
##
## $age
##
## $diag_1
##
## $diag_2
##
## $diag_3
##
## $max_glu_serum
##
## $A1Cresult
##
## $insulin
##
## $change
##
## $diabetesMed
#Dummy
library(caret)
## Loading required package: lattice
diabetic_largedata$discharge_disposition_id <- as.factor(diabetic_largedata$discharge_disposition_id)
diabetic_largedata$admission_type_id <- as.factor(diabetic_largedata$admission_type_id)
diabetic_largedata$admission_source_id <- as.factor(diabetic_largedata$admission_source_id)
dummy1 <- predict(dummyVars(~race+gender+age+discharge_disposition_id + admission_type_id + max_glu_serum + A1Cresult + admission_source_id+insulin+change+diabetesMed, data = diabetic_largedata), newdata = diabetic_largedata)
head(dummy1)
## race.AfricanAmerican race.Asian race.Caucasian race.Hispanic race.Other
## 163 0 0 1 0 0
## 461 1 0 0 0 0
## 594 0 0 1 0 0
## 697 0 0 0 0 1
## 772 0 0 1 0 0
## 824 0 0 1 0 0
## gender.Female gender.Male age.[10-20) age.[20-30) age.[30-40) age.[40-50)
## 163 0 1 0 0 0 0
## 461 1 0 0 0 0 0
## 594 1 0 0 0 0 0
## 697 0 1 0 0 0 0
## 772 1 0 0 0 1 0
## 824 0 1 0 0 0 0
## age.[50-60) age.[60-70) age.[70-80) age.[80-90) discharge_disposition_id.1
## 163 0 0 0 1 0
## 461 0 0 1 0 1
## 594 1 0 0 0 1
## 697 0 0 1 0 0
## 772 0 0 0 0 1
## 824 0 0 0 1 1
## discharge_disposition_id.2 discharge_disposition_id.3
## 163 0 1
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.5 discharge_disposition_id.6
## 163 0 0
## 461 0 0
## 594 0 0
## 697 0 1
## 772 0 0
## 824 0 0
## discharge_disposition_id.7 discharge_disposition_id.10
## 163 0 0
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.11 discharge_disposition_id.13 admission_type_id.1
## 163 0 0 0
## 461 0 0 0
## 594 0 0 0
## 697 0 0 0
## 772 0 0 0
## 824 0 0 0
## admission_type_id.2 admission_type_id.3 admission_type_id.6
## 163 0 0 1
## 461 0 0 1
## 594 0 0 1
## 697 0 0 1
## 772 0 0 1
## 824 0 0 1
## max_glu_serum.>200 max_glu_serum.>300 max_glu_serum.Norm A1Cresult.>7
## 163 1 0 0 0
## 461 0 1 0 0
## 594 0 1 0 0
## 697 1 0 0 1
## 772 0 0 1 1
## 824 0 1 0 1
## A1Cresult.>8 A1Cresult.Norm admission_source_id.1 admission_source_id.2
## 163 0 1 0 0
## 461 1 0 0 0
## 594 1 0 0 0
## 697 0 0 0 0
## 772 0 0 0 1
## 824 0 0 0 0
## admission_source_id.7 insulin.Down insulin.No insulin.Steady insulin.Up
## 163 1 0 1 0 0
## 461 1 0 0 0 1
## 594 1 0 0 1 0
## 697 1 0 1 0 0
## 772 0 0 1 0 0
## 824 1 0 1 0 0
## change.Ch change.No diabetesMed.No diabetesMed.Yes
## 163 0 1 1 0
## 461 1 0 0 1
## 594 0 1 0 1
## 697 0 1 0 1
## 772 0 1 1 0
## 824 0 1 0 1
colnames(dummy1)
## [1] "race.AfricanAmerican" "race.Asian"
## [3] "race.Caucasian" "race.Hispanic"
## [5] "race.Other" "gender.Female"
## [7] "gender.Male" "age.[10-20)"
## [9] "age.[20-30)" "age.[30-40)"
## [11] "age.[40-50)" "age.[50-60)"
## [13] "age.[60-70)" "age.[70-80)"
## [15] "age.[80-90)" "discharge_disposition_id.1"
## [17] "discharge_disposition_id.2" "discharge_disposition_id.3"
## [19] "discharge_disposition_id.5" "discharge_disposition_id.6"
## [21] "discharge_disposition_id.7" "discharge_disposition_id.10"
## [23] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [25] "admission_type_id.1" "admission_type_id.2"
## [27] "admission_type_id.3" "admission_type_id.6"
## [29] "max_glu_serum.>200" "max_glu_serum.>300"
## [31] "max_glu_serum.Norm" "A1Cresult.>7"
## [33] "A1Cresult.>8" "A1Cresult.Norm"
## [35] "admission_source_id.1" "admission_source_id.2"
## [37] "admission_source_id.7" "insulin.Down"
## [39] "insulin.No" "insulin.Steady"
## [41] "insulin.Up" "change.Ch"
## [43] "change.No" "diabetesMed.No"
## [45] "diabetesMed.Yes"
colnames(dummy1)[colnames(dummy1) == "A1Cresult.>7"] <- "A1Cresult7"
colnames(dummy1)[colnames(dummy1) == "A1Cresult.>8"] <- "A1Cresult8"
colnames(dummy1)[colnames(dummy1) == "max_glu_serum.>300"] <- "max_glu_serum300"
colnames(dummy1)[colnames(dummy1) == "max_glu_serum.>200"] <- "max_glu_serum200"
colnames(dummy1)[colnames(dummy1) == "age.[10-20)"] <- "agefirst"
colnames(dummy1)[colnames(dummy1) == "age.[20-30)"] <- "agesecond"
colnames(dummy1)[colnames(dummy1) == "age.[30-40)"] <- "agethird"
colnames(dummy1)[colnames(dummy1) == "age.[40-50)"] <- "ageforth"
colnames(dummy1)[colnames(dummy1) == "age.[50-60)"] <- "agefifth"
colnames(dummy1)[colnames(dummy1) == "age.[60-70)"] <- "agesixth"
colnames(dummy1)[colnames(dummy1) == "age.[70-80)"] <- "ageseventh"
colnames(dummy1)[colnames(dummy1) == "age.[80-90)"] <- "ageeighth"
colnames(dummy1)
## [1] "race.AfricanAmerican" "race.Asian"
## [3] "race.Caucasian" "race.Hispanic"
## [5] "race.Other" "gender.Female"
## [7] "gender.Male" "agefirst"
## [9] "agesecond" "agethird"
## [11] "ageforth" "agefifth"
## [13] "agesixth" "ageseventh"
## [15] "ageeighth" "discharge_disposition_id.1"
## [17] "discharge_disposition_id.2" "discharge_disposition_id.3"
## [19] "discharge_disposition_id.5" "discharge_disposition_id.6"
## [21] "discharge_disposition_id.7" "discharge_disposition_id.10"
## [23] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [25] "admission_type_id.1" "admission_type_id.2"
## [27] "admission_type_id.3" "admission_type_id.6"
## [29] "max_glu_serum200" "max_glu_serum300"
## [31] "max_glu_serum.Norm" "A1Cresult7"
## [33] "A1Cresult8" "A1Cresult.Norm"
## [35] "admission_source_id.1" "admission_source_id.2"
## [37] "admission_source_id.7" "insulin.Down"
## [39] "insulin.No" "insulin.Steady"
## [41] "insulin.Up" "change.Ch"
## [43] "change.No" "diabetesMed.No"
## [45] "diabetesMed.Yes"
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "race"
## [2,] "gender"
## [3,] "age"
## [4,] "admission_type_id"
## [5,] "discharge_disposition_id"
## [6,] "admission_source_id"
## [7,] "time_in_hospital"
## [8,] "num_lab_procedures"
## [9,] "num_procedures"
## [10,] "num_medications"
## [11,] "number_outpatient"
## [12,] "number_emergency"
## [13,] "number_inpatient"
## [14,] "diag_1"
## [15,] "diag_2"
## [16,] "diag_3"
## [17,] "number_diagnoses"
## [18,] "max_glu_serum"
## [19,] "A1Cresult"
## [20,] "insulin"
## [21,] "change"
## [22,] "diabetesMed"
## [23,] "readmitted"
## [24,] "diag_circ"
## [25,] "diag_resp"
## [26,] "diag_dig"
## [27,] "diag_diab"
## [28,] "diag_inj"
## [29,] "diag_musc"
## [30,] "diag_geni"
## [31,] "diag_neop"
## [32,] "diag_other"
diabetic_largedata<-cbind(diabetic_largedata[,-c(1:6 ,14:16 ,18:22 )],dummy1)
head(diabetic_largedata)
## time_in_hospital num_lab_procedures num_procedures num_medications
## 163 5 47 1 6
## 461 10 72 1 19
## 594 2 61 0 5
## 697 11 71 1 20
## 772 14 43 0 11
## 824 7 105 3 16
## number_outpatient number_emergency number_inpatient number_diagnoses
## 163 0 0 0 5
## 461 0 0 0 5
## 594 0 0 0 5
## 697 0 0 0 5
## 772 0 0 0 3
## 824 0 0 0 5
## readmitted diag_circ diag_resp diag_dig diag_diab diag_inj diag_musc
## 163 YES 1 0 0 0 0 0
## 461 YES 0 0 0 1 0 0
## 594 NO 0 0 0 1 0 0
## 697 NO 0 0 0 1 1 0
## 772 YES 0 0 1 1 0 0
## 824 YES 1 0 0 1 0 0
## diag_geni diag_neop diag_other race.AfricanAmerican race.Asian
## 163 0 0 1 0 0
## 461 0 0 1 1 0
## 594 0 0 1 0 0
## 697 0 0 0 0 0
## 772 0 0 1 0 0
## 824 0 0 1 0 0
## race.Caucasian race.Hispanic race.Other gender.Female gender.Male agefirst
## 163 1 0 0 0 1 0
## 461 0 0 0 1 0 0
## 594 1 0 0 1 0 0
## 697 0 0 1 0 1 0
## 772 1 0 0 1 0 0
## 824 1 0 0 0 1 0
## agesecond agethird ageforth agefifth agesixth ageseventh ageeighth
## 163 0 0 0 0 0 0 1
## 461 0 0 0 0 0 1 0
## 594 0 0 0 1 0 0 0
## 697 0 0 0 0 0 1 0
## 772 0 1 0 0 0 0 0
## 824 0 0 0 0 0 0 1
## discharge_disposition_id.1 discharge_disposition_id.2
## 163 0 0
## 461 1 0
## 594 1 0
## 697 0 0
## 772 1 0
## 824 1 0
## discharge_disposition_id.3 discharge_disposition_id.5
## 163 1 0
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.6 discharge_disposition_id.7
## 163 0 0
## 461 0 0
## 594 0 0
## 697 1 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.10 discharge_disposition_id.11
## 163 0 0
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.13 admission_type_id.1 admission_type_id.2
## 163 0 0 0
## 461 0 0 0
## 594 0 0 0
## 697 0 0 0
## 772 0 0 0
## 824 0 0 0
## admission_type_id.3 admission_type_id.6 max_glu_serum200 max_glu_serum300
## 163 0 1 1 0
## 461 0 1 0 1
## 594 0 1 0 1
## 697 0 1 1 0
## 772 0 1 0 0
## 824 0 1 0 1
## max_glu_serum.Norm A1Cresult7 A1Cresult8 A1Cresult.Norm
## 163 0 0 0 1
## 461 0 0 1 0
## 594 0 0 1 0
## 697 0 1 0 0
## 772 1 1 0 0
## 824 0 1 0 0
## admission_source_id.1 admission_source_id.2 admission_source_id.7
## 163 0 0 1
## 461 0 0 1
## 594 0 0 1
## 697 0 0 1
## 772 0 1 0
## 824 0 0 1
## insulin.Down insulin.No insulin.Steady insulin.Up change.Ch change.No
## 163 0 1 0 0 0 1
## 461 0 0 0 1 1 0
## 594 0 0 1 0 0 1
## 697 0 1 0 0 0 1
## 772 0 1 0 0 0 1
## 824 0 1 0 0 0 1
## diabetesMed.No diabetesMed.Yes
## 163 1 0
## 461 0 1
## 594 0 1
## 697 0 1
## 772 1 0
## 824 0 1
#replacing "." with empthy variable
colnames(diabetic_largedata) <- gsub("\\.", "", colnames(diabetic_largedata))
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "time_in_hospital"
## [2,] "num_lab_procedures"
## [3,] "num_procedures"
## [4,] "num_medications"
## [5,] "number_outpatient"
## [6,] "number_emergency"
## [7,] "number_inpatient"
## [8,] "number_diagnoses"
## [9,] "readmitted"
## [10,] "diag_circ"
## [11,] "diag_resp"
## [12,] "diag_dig"
## [13,] "diag_diab"
## [14,] "diag_inj"
## [15,] "diag_musc"
## [16,] "diag_geni"
## [17,] "diag_neop"
## [18,] "diag_other"
## [19,] "raceAfricanAmerican"
## [20,] "raceAsian"
## [21,] "raceCaucasian"
## [22,] "raceHispanic"
## [23,] "raceOther"
## [24,] "genderFemale"
## [25,] "genderMale"
## [26,] "agefirst"
## [27,] "agesecond"
## [28,] "agethird"
## [29,] "ageforth"
## [30,] "agefifth"
## [31,] "agesixth"
## [32,] "ageseventh"
## [33,] "ageeighth"
## [34,] "discharge_disposition_id1"
## [35,] "discharge_disposition_id2"
## [36,] "discharge_disposition_id3"
## [37,] "discharge_disposition_id5"
## [38,] "discharge_disposition_id6"
## [39,] "discharge_disposition_id7"
## [40,] "discharge_disposition_id10"
## [41,] "discharge_disposition_id11"
## [42,] "discharge_disposition_id13"
## [43,] "admission_type_id1"
## [44,] "admission_type_id2"
## [45,] "admission_type_id3"
## [46,] "admission_type_id6"
## [47,] "max_glu_serum200"
## [48,] "max_glu_serum300"
## [49,] "max_glu_serumNorm"
## [50,] "A1Cresult7"
## [51,] "A1Cresult8"
## [52,] "A1CresultNorm"
## [53,] "admission_source_id1"
## [54,] "admission_source_id2"
## [55,] "admission_source_id7"
## [56,] "insulinDown"
## [57,] "insulinNo"
## [58,] "insulinSteady"
## [59,] "insulinUp"
## [60,] "changeCh"
## [61,] "changeNo"
## [62,] "diabetesMedNo"
## [63,] "diabetesMedYes"
summary(diabetic_largedata)
## time_in_hospital num_lab_procedures num_procedures num_medications
## Min. : 1.000 Min. : 31.0 Min. :0.0000 Min. : 1.00
## 1st Qu.: 3.000 1st Qu.: 54.0 1st Qu.:0.0000 1st Qu.: 9.00
## Median : 5.000 Median : 63.0 Median :0.0000 Median :14.00
## Mean : 5.398 Mean : 64.2 Mean :0.8443 Mean :14.54
## 3rd Qu.: 7.000 3rd Qu.: 74.0 3rd Qu.:1.0000 3rd Qu.:19.00
## Max. :14.000 Max. :106.0 Max. :6.0000 Max. :35.00
## number_outpatient number_emergency number_inpatient number_diagnoses
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :3.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:5.000
## Median :0.0000 Median :0.000 Median :0.0000 Median :6.000
## Mean :0.1592 Mean :0.173 Mean :0.6678 Mean :5.958
## 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:6.000
## Max. :6.0000 Max. :9.000 Max. :9.0000 Max. :9.000
## readmitted diag_circ diag_resp diag_dig
## Length:289 Min. :0.0000 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.5363 Mean :0.3114 Mean :0.1211
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## diag_diab diag_inj diag_musc diag_geni
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.6263 Mean :0.06574 Mean :0.04498 Mean :0.1453
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## diag_neop diag_other raceAfricanAmerican raceAsian
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :1.0000 Median :0.0000 Median :0.00000
## Mean :0.04152 Mean :0.5467 Mean :0.1765 Mean :0.02422
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## raceCaucasian raceHispanic raceOther genderFemale
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.6228 Mean :0.1315 Mean :0.04498 Mean :0.5813
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## genderMale agefirst agesecond agethird
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.4187 Mean :0.01384 Mean :0.01384 Mean :0.05882
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## ageforth agefifth agesixth ageseventh
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1315 Mean :0.2249 Mean :0.2076 Mean :0.2076
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## ageeighth discharge_disposition_id1 discharge_disposition_id2
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.1419 Mean :0.6574 Mean :0.0692
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## discharge_disposition_id3 discharge_disposition_id5 discharge_disposition_id6
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1142 Mean :0.0173 Mean :0.09343
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## discharge_disposition_id7 discharge_disposition_id10
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000
## Mean :0.03114 Mean :0.00346
## 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000
## discharge_disposition_id11 discharge_disposition_id13 admission_type_id1
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.01038 Mean :0.00346 Mean :0.1799
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000
## admission_type_id2 admission_type_id3 admission_type_id6 max_glu_serum200
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.01384 Mean :0.00692 Mean :0.7993 Mean :0.2388
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.000 Median :1.0000
## Mean :0.4291 Mean :0.3322 Mean :0.218 Mean :0.5917
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.0000
## A1CresultNorm admission_source_id1 admission_source_id2
## Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.1903 Mean :0.07958 Mean :0.00692
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000
## admission_source_id7 insulinDown insulinNo insulinSteady
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :1.000 Median :0.0000
## Mean :0.9135 Mean :0.0692 Mean :0.699 Mean :0.1834
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.0000
## insulinUp changeCh changeNo diabetesMedNo
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.04844 Mean :0.2907 Mean :0.7093 Mean :0.4083
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## diabetesMedYes
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5917
## 3rd Qu.:1.0000
## Max. :1.0000
nrow(diabetic_largedata)
## [1] 289
ncol(diabetic_largedata)
## [1] 63
#Bringing outcome variable to the last column
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "time_in_hospital"
## [2,] "num_lab_procedures"
## [3,] "num_procedures"
## [4,] "num_medications"
## [5,] "number_outpatient"
## [6,] "number_emergency"
## [7,] "number_inpatient"
## [8,] "number_diagnoses"
## [9,] "readmitted"
## [10,] "diag_circ"
## [11,] "diag_resp"
## [12,] "diag_dig"
## [13,] "diag_diab"
## [14,] "diag_inj"
## [15,] "diag_musc"
## [16,] "diag_geni"
## [17,] "diag_neop"
## [18,] "diag_other"
## [19,] "raceAfricanAmerican"
## [20,] "raceAsian"
## [21,] "raceCaucasian"
## [22,] "raceHispanic"
## [23,] "raceOther"
## [24,] "genderFemale"
## [25,] "genderMale"
## [26,] "agefirst"
## [27,] "agesecond"
## [28,] "agethird"
## [29,] "ageforth"
## [30,] "agefifth"
## [31,] "agesixth"
## [32,] "ageseventh"
## [33,] "ageeighth"
## [34,] "discharge_disposition_id1"
## [35,] "discharge_disposition_id2"
## [36,] "discharge_disposition_id3"
## [37,] "discharge_disposition_id5"
## [38,] "discharge_disposition_id6"
## [39,] "discharge_disposition_id7"
## [40,] "discharge_disposition_id10"
## [41,] "discharge_disposition_id11"
## [42,] "discharge_disposition_id13"
## [43,] "admission_type_id1"
## [44,] "admission_type_id2"
## [45,] "admission_type_id3"
## [46,] "admission_type_id6"
## [47,] "max_glu_serum200"
## [48,] "max_glu_serum300"
## [49,] "max_glu_serumNorm"
## [50,] "A1Cresult7"
## [51,] "A1Cresult8"
## [52,] "A1CresultNorm"
## [53,] "admission_source_id1"
## [54,] "admission_source_id2"
## [55,] "admission_source_id7"
## [56,] "insulinDown"
## [57,] "insulinNo"
## [58,] "insulinSteady"
## [59,] "insulinUp"
## [60,] "changeCh"
## [61,] "changeNo"
## [62,] "diabetesMedNo"
## [63,] "diabetesMedYes"
diabetic_largedata<-diabetic_largedata[c(1:8, 10:63, 9)]
head(diabetic_largedata)
## time_in_hospital num_lab_procedures num_procedures num_medications
## 163 5 47 1 6
## 461 10 72 1 19
## 594 2 61 0 5
## 697 11 71 1 20
## 772 14 43 0 11
## 824 7 105 3 16
## number_outpatient number_emergency number_inpatient number_diagnoses
## 163 0 0 0 5
## 461 0 0 0 5
## 594 0 0 0 5
## 697 0 0 0 5
## 772 0 0 0 3
## 824 0 0 0 5
## diag_circ diag_resp diag_dig diag_diab diag_inj diag_musc diag_geni
## 163 1 0 0 0 0 0 0
## 461 0 0 0 1 0 0 0
## 594 0 0 0 1 0 0 0
## 697 0 0 0 1 1 0 0
## 772 0 0 1 1 0 0 0
## 824 1 0 0 1 0 0 0
## diag_neop diag_other raceAfricanAmerican raceAsian raceCaucasian
## 163 0 1 0 0 1
## 461 0 1 1 0 0
## 594 0 1 0 0 1
## 697 0 0 0 0 0
## 772 0 1 0 0 1
## 824 0 1 0 0 1
## raceHispanic raceOther genderFemale genderMale agefirst agesecond agethird
## 163 0 0 0 1 0 0 0
## 461 0 0 1 0 0 0 0
## 594 0 0 1 0 0 0 0
## 697 0 1 0 1 0 0 0
## 772 0 0 1 0 0 0 1
## 824 0 0 0 1 0 0 0
## ageforth agefifth agesixth ageseventh ageeighth discharge_disposition_id1
## 163 0 0 0 0 1 0
## 461 0 0 0 1 0 1
## 594 0 1 0 0 0 1
## 697 0 0 0 1 0 0
## 772 0 0 0 0 0 1
## 824 0 0 0 0 1 1
## discharge_disposition_id2 discharge_disposition_id3
## 163 0 1
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id5 discharge_disposition_id6
## 163 0 0
## 461 0 0
## 594 0 0
## 697 0 1
## 772 0 0
## 824 0 0
## discharge_disposition_id7 discharge_disposition_id10
## 163 0 0
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id11 discharge_disposition_id13 admission_type_id1
## 163 0 0 0
## 461 0 0 0
## 594 0 0 0
## 697 0 0 0
## 772 0 0 0
## 824 0 0 0
## admission_type_id2 admission_type_id3 admission_type_id6 max_glu_serum200
## 163 0 0 1 1
## 461 0 0 1 0
## 594 0 0 1 0
## 697 0 0 1 1
## 772 0 0 1 0
## 824 0 0 1 0
## max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8 A1CresultNorm
## 163 0 0 0 0 1
## 461 1 0 0 1 0
## 594 1 0 0 1 0
## 697 0 0 1 0 0
## 772 0 1 1 0 0
## 824 1 0 1 0 0
## admission_source_id1 admission_source_id2 admission_source_id7 insulinDown
## 163 0 0 1 0
## 461 0 0 1 0
## 594 0 0 1 0
## 697 0 0 1 0
## 772 0 1 0 0
## 824 0 0 1 0
## insulinNo insulinSteady insulinUp changeCh changeNo diabetesMedNo
## 163 1 0 0 0 1 1
## 461 0 0 1 1 0 0
## 594 0 1 0 0 1 0
## 697 1 0 0 0 1 0
## 772 1 0 0 0 1 1
## 824 1 0 0 0 1 0
## diabetesMedYes readmitted
## 163 0 YES
## 461 1 YES
## 594 1 NO
## 697 1 NO
## 772 0 YES
## 824 1 YES
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "time_in_hospital"
## [2,] "num_lab_procedures"
## [3,] "num_procedures"
## [4,] "num_medications"
## [5,] "number_outpatient"
## [6,] "number_emergency"
## [7,] "number_inpatient"
## [8,] "number_diagnoses"
## [9,] "diag_circ"
## [10,] "diag_resp"
## [11,] "diag_dig"
## [12,] "diag_diab"
## [13,] "diag_inj"
## [14,] "diag_musc"
## [15,] "diag_geni"
## [16,] "diag_neop"
## [17,] "diag_other"
## [18,] "raceAfricanAmerican"
## [19,] "raceAsian"
## [20,] "raceCaucasian"
## [21,] "raceHispanic"
## [22,] "raceOther"
## [23,] "genderFemale"
## [24,] "genderMale"
## [25,] "agefirst"
## [26,] "agesecond"
## [27,] "agethird"
## [28,] "ageforth"
## [29,] "agefifth"
## [30,] "agesixth"
## [31,] "ageseventh"
## [32,] "ageeighth"
## [33,] "discharge_disposition_id1"
## [34,] "discharge_disposition_id2"
## [35,] "discharge_disposition_id3"
## [36,] "discharge_disposition_id5"
## [37,] "discharge_disposition_id6"
## [38,] "discharge_disposition_id7"
## [39,] "discharge_disposition_id10"
## [40,] "discharge_disposition_id11"
## [41,] "discharge_disposition_id13"
## [42,] "admission_type_id1"
## [43,] "admission_type_id2"
## [44,] "admission_type_id3"
## [45,] "admission_type_id6"
## [46,] "max_glu_serum200"
## [47,] "max_glu_serum300"
## [48,] "max_glu_serumNorm"
## [49,] "A1Cresult7"
## [50,] "A1Cresult8"
## [51,] "A1CresultNorm"
## [52,] "admission_source_id1"
## [53,] "admission_source_id2"
## [54,] "admission_source_id7"
## [55,] "insulinDown"
## [56,] "insulinNo"
## [57,] "insulinSteady"
## [58,] "insulinUp"
## [59,] "changeCh"
## [60,] "changeNo"
## [61,] "diabetesMedNo"
## [62,] "diabetesMedYes"
## [63,] "readmitted"
#Omitting the columns that their variance is near to 0 like 0.01
#lapply(data.frame(dummy1),function(x)table(x))
#some of the columns has so many 0s
table_list <- lapply(data.frame(dummy1), function(x) table(x))
plot_table <- function(tbl, var_name) {
barplot(tbl, main = var_name, col = rainbow(length(tbl)),
xlab = "", ylab = "", border = NA)
legend("topright", legend = names(tbl), fill = rainbow(length(tbl)),
bty = "n", cex = 0.8)
}
mapply(plot_table, table_list, var_name = colnames(dummy1))
## race.AfricanAmerican race.Asian race.Caucasian race.Hispanic race.Other
## rect list,4 list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2 list,2
## gender.Female gender.Male agefirst agesecond agethird ageforth agefifth
## rect list,4 list,4 list,4 list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2 list,2 list,2 list,2
## agesixth ageseventh ageeighth discharge_disposition_id.1
## rect list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2
## discharge_disposition_id.2 discharge_disposition_id.3
## rect list,4 list,4
## text list,2 list,2
## discharge_disposition_id.5 discharge_disposition_id.6
## rect list,4 list,4
## text list,2 list,2
## discharge_disposition_id.7 discharge_disposition_id.10
## rect list,4 list,4
## text list,2 list,2
## discharge_disposition_id.11 discharge_disposition_id.13
## rect list,4 list,4
## text list,2 list,2
## admission_type_id.1 admission_type_id.2 admission_type_id.3
## rect list,4 list,4 list,4
## text list,2 list,2 list,2
## admission_type_id.6 max_glu_serum200 max_glu_serum300 max_glu_serum.Norm
## rect list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2
## A1Cresult7 A1Cresult8 A1Cresult.Norm admission_source_id.1
## rect list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2
## admission_source_id.2 admission_source_id.7 insulin.Down insulin.No
## rect list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2
## insulin.Steady insulin.Up change.Ch change.No diabetesMed.No
## rect list,4 list,4 list,4 list,4 list,4
## text list,2 list,2 list,2 list,2 list,2
## diabetesMed.Yes
## rect list,4
## text list,2
lapply(data.frame(dummy1),function(x)var(x))
## $race.AfricanAmerican
## [1] 0.1458333
##
## $race.Asian
## [1] 0.02371684
##
## $race.Caucasian
## [1] 0.2357266
##
## $race.Hispanic
## [1] 0.1145953
##
## $race.Other
## [1] 0.04310842
##
## $gender.Female
## [1] 0.244233
##
## $gender.Male
## [1] 0.244233
##
## $agefirst
## [1] 0.01369666
##
## $agesecond
## [1] 0.01369666
##
## $agethird
## [1] 0.05555556
##
## $ageforth
## [1] 0.1145953
##
## $agefifth
## [1] 0.1749327
##
## $agesixth
## [1] 0.1650807
##
## $ageseventh
## [1] 0.1650807
##
## $ageeighth
## [1] 0.1221646
##
## $discharge_disposition_id.1
## [1] 0.2259948
##
## $discharge_disposition_id.2
## [1] 0.0646386
##
## $discharge_disposition_id.3
## [1] 0.1014994
##
## $discharge_disposition_id.5
## [1] 0.01706075
##
## $discharge_disposition_id.6
## [1] 0.08499135
##
## $discharge_disposition_id.7
## [1] 0.03027682
##
## $discharge_disposition_id.10
## [1] 0.003460208
##
## $discharge_disposition_id.11
## [1] 0.01030854
##
## $discharge_disposition_id.13
## [1] 0.003460208
##
## $admission_type_id.1
## [1] 0.1480681
##
## $admission_type_id.2
## [1] 0.01369666
##
## $admission_type_id.3
## [1] 0.006896386
##
## $admission_type_id.6
## [1] 0.1609717
##
## $max_glu_serum200
## [1] 0.1823818
##
## $max_glu_serum300
## [1] 0.2458189
##
## $max_glu_serum.Norm
## [1] 0.2226067
##
## $A1Cresult7
## [1] 0.171064
##
## $A1Cresult8
## [1] 0.2424308
##
## $A1Cresult.Norm
## [1] 0.154628
##
## $admission_source_id.1
## [1] 0.07350538
##
## $admission_source_id.2
## [1] 0.006896386
##
## $admission_source_id.7
## [1] 0.07929642
##
## $insulin.Down
## [1] 0.0646386
##
## $insulin.No
## [1] 0.2111448
##
## $insulin.Steady
## [1] 0.1502787
##
## $insulin.Up
## [1] 0.04625625
##
## $change.Ch
## [1] 0.2068916
##
## $change.No
## [1] 0.2068916
##
## $diabetesMed.No
## [1] 0.2424308
##
## $diabetesMed.Yes
## [1] 0.2424308
#omiting the columns with the nearZero variance
variances <- lapply(diabetic_largedata,function(x)var(x))
near_zero <- which(variances < 0.05)
colnames(diabetic_largedata)[near_zero]
## [1] "diag_musc" "diag_neop"
## [3] "raceAsian" "raceOther"
## [5] "agefirst" "agesecond"
## [7] "discharge_disposition_id5" "discharge_disposition_id7"
## [9] "discharge_disposition_id10" "discharge_disposition_id11"
## [11] "discharge_disposition_id13" "admission_type_id2"
## [13] "admission_type_id3" "admission_source_id2"
## [15] "insulinUp"
cols_to_remove <- c("diag_musc","diag_neop","raceAsian","raceOther","agefirst","agesecond","discharge_disposition_id5","discharge_disposition_id7","discharge_disposition_id10","discharge_disposition_id11","discharge_disposition_id13","admission_type_id2","admission_type_id3","admission_source_id2","insulinUp")
diabetic_largedata <- diabetic_largedata[, !(colnames(diabetic_largedata) %in% cols_to_remove)]
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "time_in_hospital"
## [2,] "num_lab_procedures"
## [3,] "num_procedures"
## [4,] "num_medications"
## [5,] "number_outpatient"
## [6,] "number_emergency"
## [7,] "number_inpatient"
## [8,] "number_diagnoses"
## [9,] "diag_circ"
## [10,] "diag_resp"
## [11,] "diag_dig"
## [12,] "diag_diab"
## [13,] "diag_inj"
## [14,] "diag_geni"
## [15,] "diag_other"
## [16,] "raceAfricanAmerican"
## [17,] "raceCaucasian"
## [18,] "raceHispanic"
## [19,] "genderFemale"
## [20,] "genderMale"
## [21,] "agethird"
## [22,] "ageforth"
## [23,] "agefifth"
## [24,] "agesixth"
## [25,] "ageseventh"
## [26,] "ageeighth"
## [27,] "discharge_disposition_id1"
## [28,] "discharge_disposition_id2"
## [29,] "discharge_disposition_id3"
## [30,] "discharge_disposition_id6"
## [31,] "admission_type_id1"
## [32,] "admission_type_id6"
## [33,] "max_glu_serum200"
## [34,] "max_glu_serum300"
## [35,] "max_glu_serumNorm"
## [36,] "A1Cresult7"
## [37,] "A1Cresult8"
## [38,] "A1CresultNorm"
## [39,] "admission_source_id1"
## [40,] "admission_source_id7"
## [41,] "insulinDown"
## [42,] "insulinNo"
## [43,] "insulinSteady"
## [44,] "changeCh"
## [45,] "changeNo"
## [46,] "diabetesMedNo"
## [47,] "diabetesMedYes"
## [48,] "readmitted"
nzv_cols <- nearZeroVar(diabetic_largedata[, -48], saveMetrics = TRUE)$nzv
cbind(nzv_cols)
## nzv_cols
## [1,] FALSE
## [2,] FALSE
## [3,] FALSE
## [4,] FALSE
## [5,] TRUE
## [6,] TRUE
## [7,] FALSE
## [8,] FALSE
## [9,] FALSE
## [10,] FALSE
## [11,] FALSE
## [12,] FALSE
## [13,] FALSE
## [14,] FALSE
## [15,] FALSE
## [16,] FALSE
## [17,] FALSE
## [18,] FALSE
## [19,] FALSE
## [20,] FALSE
## [21,] FALSE
## [22,] FALSE
## [23,] FALSE
## [24,] FALSE
## [25,] FALSE
## [26,] FALSE
## [27,] FALSE
## [28,] FALSE
## [29,] FALSE
## [30,] FALSE
## [31,] FALSE
## [32,] FALSE
## [33,] FALSE
## [34,] FALSE
## [35,] FALSE
## [36,] FALSE
## [37,] FALSE
## [38,] FALSE
## [39,] FALSE
## [40,] FALSE
## [41,] FALSE
## [42,] FALSE
## [43,] FALSE
## [44,] FALSE
## [45,] FALSE
## [46,] FALSE
## [47,] FALSE
diabetic_largedata<-diabetic_largedata[,-c(5,6)]
str(diabetic_largedata)
## 'data.frame': 289 obs. of 46 variables:
## $ time_in_hospital : int 5 10 2 11 14 7 2 3 2 4 ...
## $ num_lab_procedures : int 47 72 61 71 43 105 66 76 43 41 ...
## $ num_procedures : int 1 1 0 1 0 3 0 0 0 1 ...
## $ num_medications : int 6 19 5 20 11 16 3 9 13 8 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_diagnoses : int 5 5 5 5 3 5 3 5 5 3 ...
## $ diag_circ : num 1 0 0 0 0 1 1 0 0 0 ...
## $ diag_resp : num 0 0 0 0 0 0 0 0 0 1 ...
## $ diag_dig : num 0 0 0 0 1 0 0 0 0 0 ...
## $ diag_diab : num 0 1 1 1 1 1 1 0 1 1 ...
## $ diag_inj : num 0 0 0 1 0 0 0 0 1 0 ...
## $ diag_geni : num 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_other : num 1 1 1 0 1 1 1 1 1 0 ...
## $ raceAfricanAmerican : num 0 1 0 0 0 0 0 0 0 0 ...
## $ raceCaucasian : num 1 0 1 0 1 1 0 1 1 1 ...
## $ raceHispanic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ genderFemale : num 0 1 1 0 1 0 1 1 1 0 ...
## $ genderMale : num 1 0 0 1 0 1 0 0 0 1 ...
## $ agethird : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ageforth : num 0 0 0 0 0 0 0 1 0 0 ...
## $ agefifth : num 0 0 1 0 0 0 1 0 1 1 ...
## $ agesixth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ageseventh : num 0 1 0 1 0 0 0 0 0 0 ...
## $ ageeighth : num 1 0 0 0 0 1 0 0 0 0 ...
## $ discharge_disposition_id1: num 0 1 1 0 1 1 1 1 1 0 ...
## $ discharge_disposition_id2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id3: num 1 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id6: num 0 0 0 1 0 0 0 0 0 0 ...
## $ admission_type_id1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ max_glu_serum200 : num 1 0 0 1 0 0 0 0 0 1 ...
## $ max_glu_serum300 : num 0 1 1 0 0 1 0 1 1 0 ...
## $ max_glu_serumNorm : num 0 0 0 0 1 0 1 0 0 0 ...
## $ A1Cresult7 : num 0 0 0 1 1 1 1 1 1 0 ...
## $ A1Cresult8 : num 0 1 1 0 0 0 0 0 0 1 ...
## $ A1CresultNorm : num 1 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id1 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ admission_source_id7 : num 1 1 1 1 0 1 1 1 1 0 ...
## $ insulinDown : num 0 0 0 0 0 0 0 0 0 0 ...
## $ insulinNo : num 1 0 0 1 1 1 1 1 1 1 ...
## $ insulinSteady : num 0 0 1 0 0 0 0 0 0 0 ...
## $ changeCh : num 0 1 0 0 0 0 0 1 0 0 ...
## $ changeNo : num 1 0 1 1 1 1 1 0 1 1 ...
## $ diabetesMedNo : num 1 0 0 0 1 0 0 0 1 0 ...
## $ diabetesMedYes : num 0 1 1 1 0 1 1 1 0 1 ...
## $ readmitted : chr "YES" "YES" "NO" "NO" ...
#Test & Train Split
library(caTools)
set.seed(123)
split<-sample.split(diabetic_largedata$readmitted,SplitRatio=0.8)
train_diabetic_largedata<-subset(diabetic_largedata,split==T)
test_diabetic_largedata<-subset(diabetic_largedata,split==F)
sum(nrow(train_diabetic_largedata),nrow(test_diabetic_largedata))
## [1] 289
#Feature Scaling
library(caret)
normParam <- preProcess(train_diabetic_largedata, method = c("center", "scale"))
train_diabetic_largedata <- predict(normParam, train_diabetic_largedata)
test_diabetic_largedata<- predict(normParam,test_diabetic_largedata)
head(train_diabetic_largedata)
## time_in_hospital num_lab_procedures num_procedures num_medications
## 163 -0.1505286 -1.2039769 0.1168388 -1.1571953
## 594 -1.1346449 -0.2232058 -0.6769776 -1.2905846
## 697 1.8177039 0.4773450 0.1168388 0.7102546
## 772 2.8018202 -1.4841972 -0.6769776 -0.4902489
## 1281 -0.8066061 0.8276204 -0.6769776 -0.7570275
## 1756 -1.1346449 -1.4841972 -0.6769776 -0.2234704
## number_inpatient number_diagnoses diag_circ diag_resp diag_dig
## 163 -0.5275243 -0.6526203 0.9188738 -0.6647022 -0.3705859
## 594 -0.5275243 -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 697 -0.5275243 -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 772 -0.5275243 -1.9867378 -1.0835776 -0.6647022 2.6867480
## 1281 -0.5275243 -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 1756 -0.5275243 -0.6526203 -1.0835776 -0.6647022 -0.3705859
## diag_diab diag_inj diag_geni diag_other raceAfricanAmerican
## 163 -1.2956649 -0.281239 -0.4357706 0.9029688 -0.4073637
## 594 0.7684633 -0.281239 -0.4357706 0.9029688 -0.4073637
## 697 0.7684633 3.540302 -0.4357706 -1.1026638 -0.4073637
## 772 0.7684633 -0.281239 -0.4357706 0.9029688 -0.4073637
## 1281 -1.2956649 -0.281239 -0.4357706 0.9029688 -0.4073637
## 1756 0.7684633 3.540302 -0.4357706 0.9029688 -0.4073637
## raceCaucasian raceHispanic genderFemale genderMale agethird ageforth
## 163 0.7124584 -0.3928473 -1.1624404 1.1624404 -0.2436696 -0.4001349
## 594 0.7124584 -0.3928473 0.8565351 -0.8565351 -0.2436696 -0.4001349
## 697 -1.3975146 -0.3928473 -1.1624404 1.1624404 -0.2436696 -0.4001349
## 772 0.7124584 -0.3928473 0.8565351 -0.8565351 4.0861518 -0.4001349
## 1281 0.7124584 -0.3928473 0.8565351 -0.8565351 -0.2436696 2.4883386
## 1756 0.7124584 -0.3928473 0.8565351 -0.8565351 -0.2436696 -0.4001349
## agefifth agesixth ageseventh ageeighth discharge_disposition_id1
## 163 -0.5043104 -0.5043104 -0.5244495 2.3613042 -1.3450677
## 594 1.9743217 -0.5043104 -0.5244495 -0.4216615 0.7402386
## 697 -0.5043104 -0.5043104 1.8985070 -0.4216615 -1.3450677
## 772 -0.5043104 -0.5043104 -0.5244495 -0.4216615 0.7402386
## 1281 -0.5043104 -0.5043104 -0.5244495 -0.4216615 0.7402386
## 1756 1.9743217 -0.5043104 -0.5244495 -0.4216615 0.7402386
## discharge_disposition_id2 discharge_disposition_id3
## 163 -0.2629521 2.7427810
## 594 -0.2629521 -0.3630151
## 697 -0.2629521 -0.3630151
## 772 -0.2629521 -0.3630151
## 1281 -0.2629521 -0.3630151
## 1756 -0.2629521 -0.3630151
## discharge_disposition_id6 admission_type_id1 admission_type_id6
## 163 -0.3397648 -0.4703831 0.5110378
## 594 -0.3397648 -0.4703831 0.5110378
## 697 2.9304714 -0.4703831 0.5110378
## 772 -0.3397648 -0.4703831 0.5110378
## 1281 -0.3397648 -0.4703831 0.5110378
## 1756 -0.3397648 -0.4703831 0.5110378
## max_glu_serum200 max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8
## 163 1.7639365 -0.8414468 -0.7193656 -0.5511479 -1.1832846
## 594 -0.5644597 1.1832846 -0.7193656 -0.5511479 0.8414468
## 697 1.7639365 -0.8414468 -0.7193656 1.8065404 -1.1832846
## 772 -0.5644597 -0.8414468 1.3840959 1.8065404 -1.1832846
## 1281 -0.5644597 1.1832846 -0.7193656 1.8065404 -1.1832846
## 1756 -0.5644597 1.1832846 -0.7193656 1.8065404 -1.1832846
## A1CresultNorm admission_source_id1 admission_source_id7 insulinDown
## 163 2.1167238 -0.3155425 0.3318105 -0.2534499
## 594 -0.4703831 -0.3155425 0.3318105 -0.2534499
## 697 -0.4703831 -0.3155425 0.3318105 -0.2534499
## 772 -0.4703831 -0.3155425 -3.0007214 -0.2534499
## 1281 -0.4703831 -0.3155425 0.3318105 -0.2534499
## 1756 -0.4703831 -0.3155425 0.3318105 -0.2534499
## insulinNo insulinSteady changeCh changeNo diabetesMedNo
## 163 0.6243966 -0.4497173 -0.6177172 0.6177172 1.1521985
## 594 -1.5946130 2.2139930 -0.6177172 0.6177172 -0.8641489
## 697 0.6243966 -0.4497173 -0.6177172 0.6177172 -0.8641489
## 772 0.6243966 -0.4497173 -0.6177172 0.6177172 1.1521985
## 1281 0.6243966 -0.4497173 1.6118558 -1.6118558 -0.8641489
## 1756 0.6243966 -0.4497173 -0.6177172 0.6177172 1.1521985
## diabetesMedYes readmitted
## 163 -1.1521985 YES
## 594 0.8641489 NO
## 697 0.8641489 NO
## 772 -1.1521985 YES
## 1281 0.8641489 YES
## 1756 -1.1521985 YES
#install.packages("reshape2")
library(reshape2)
library("ggplot2")
plot_heatmap <- function(data, outcome_var) {
data_numeric <- data[, sapply(data, is.numeric)]
corr_matrix <- cor(data_numeric)
ggplot(data = melt(corr_matrix), aes(x = Var2, y = Var1, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "#f7fbff", high = "#08306b", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 10, hjust = 1)) +
ggtitle(paste("Correlation Heatmap of", ncol(data_numeric), "Numeric Variables"))
}
plot_heatmap(train_diabetic_largedata, readmitted)
train_diabetic_largedata$readmitted<-as.factor(train_diabetic_largedata$readmitted)
test_diabetic_largedata$readmitted<-as.factor(test_diabetic_largedata$readmitted)
str(train_diabetic_largedata$readmitted)
## Factor w/ 2 levels "NO","YES": 2 1 1 2 2 2 2 2 1 2 ...
table(train_diabetic_largedata$readmitted)
##
## NO YES
## 93 138
cbind(colnames(diabetic_largedata))
## [,1]
## [1,] "time_in_hospital"
## [2,] "num_lab_procedures"
## [3,] "num_procedures"
## [4,] "num_medications"
## [5,] "number_inpatient"
## [6,] "number_diagnoses"
## [7,] "diag_circ"
## [8,] "diag_resp"
## [9,] "diag_dig"
## [10,] "diag_diab"
## [11,] "diag_inj"
## [12,] "diag_geni"
## [13,] "diag_other"
## [14,] "raceAfricanAmerican"
## [15,] "raceCaucasian"
## [16,] "raceHispanic"
## [17,] "genderFemale"
## [18,] "genderMale"
## [19,] "agethird"
## [20,] "ageforth"
## [21,] "agefifth"
## [22,] "agesixth"
## [23,] "ageseventh"
## [24,] "ageeighth"
## [25,] "discharge_disposition_id1"
## [26,] "discharge_disposition_id2"
## [27,] "discharge_disposition_id3"
## [28,] "discharge_disposition_id6"
## [29,] "admission_type_id1"
## [30,] "admission_type_id6"
## [31,] "max_glu_serum200"
## [32,] "max_glu_serum300"
## [33,] "max_glu_serumNorm"
## [34,] "A1Cresult7"
## [35,] "A1Cresult8"
## [36,] "A1CresultNorm"
## [37,] "admission_source_id1"
## [38,] "admission_source_id7"
## [39,] "insulinDown"
## [40,] "insulinNo"
## [41,] "insulinSteady"
## [42,] "changeCh"
## [43,] "changeNo"
## [44,] "diabetesMedNo"
## [45,] "diabetesMedYes"
## [46,] "readmitted"
Unsupervised Learning
##PCA
set.seed(123)
PCA <- prcomp(train_diabetic_largedata[,-46],
center = TRUE,
scale = TRUE)
library("factoextra")
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("factoextra")
get_eigenvalue(PCA)
## eigenvalue
## Dim.1 4.6598325031985474709017580607905983925
## Dim.2 3.7805646840861397706134994223248213530
## Dim.3 2.9187897831060576336881240422371774912
## Dim.4 2.4503496574654932693704267876455560327
## Dim.5 2.3383766737560160997588809550506994128
## Dim.6 2.1227540207801922633734648115932941437
## Dim.7 1.9689523355885441002754987493972294033
## Dim.8 1.7768202520662772503357018649694509804
## Dim.9 1.6556170991362784139511177272652275860
## Dim.10 1.5769560456009974469537837649113498628
## Dim.11 1.5441741334508187755147901043528690934
## Dim.12 1.2948678670632081111335764944669790566
## Dim.13 1.2579494148627754501745812376611866057
## Dim.14 1.2280233243125018294250594408367760479
## Dim.15 1.1665266791117452349624272756045684218
## Dim.16 1.1079529810168642356416057737078517675
## Dim.17 1.0898202757579218413752641936298459768
## Dim.18 0.9939596665362285810019216114596929401
## Dim.19 0.9667172885340085120020603426382876933
## Dim.20 0.8810216142539648576104127641883678734
## Dim.21 0.8306493640521940280407875434320885688
## Dim.22 0.7782969625701783300542047072667628527
## Dim.23 0.7335514638375298934747092971520032734
## Dim.24 0.7082867909546097795470132041373290122
## Dim.25 0.6874449211291094874454188357049133629
## Dim.26 0.6554286266942238592392300233768764883
## Dim.27 0.6256500368707126336076385086926165968
## Dim.28 0.5727451562829668496590329596074298024
## Dim.29 0.5048664634834265640961348253767937422
## Dim.30 0.4744654246252194562671888888871762902
## Dim.31 0.4179866839151417101660968000942375511
## Dim.32 0.2896741405003985803112698249606182799
## Dim.33 0.2803808188302840398442583591531729326
## Dim.34 0.2511128013155019544910828699357807636
## Dim.35 0.1200714279244419480097150199071620591
## Dim.36 0.1023118327140242755923793538386235014
## Dim.37 0.0785489994747356201632015881841653027
## Dim.38 0.0414145218953088020685804337972513167
## Dim.39 0.0377707901635429832420953744076541625
## Dim.40 0.0293164730818278491952799669206797262
## Dim.41 0.0000000000000000000000000000009189899
## Dim.42 0.0000000000000000000000000000001620751
## Dim.43 0.0000000000000000000000000000001418502
## Dim.44 0.0000000000000000000000000000001237976
## Dim.45 0.0000000000000000000000000000001133062
## variance.percent cumulative.variance.percent
## Dim.1 10.3551833404412310102316041593439877033 10.35518
## Dim.2 8.4012548535247670855596879846416413784 18.75644
## Dim.3 6.4861995180134703886665192840155214071 25.24264
## Dim.4 5.4452214610344373824091235292144119740 30.68786
## Dim.5 5.1963926083467093519629997899755835533 35.88425
## Dim.6 4.7172311572893228515113150933757424355 40.60148
## Dim.7 4.3754496346412157237182327662594616413 44.97693
## Dim.8 3.9484894490361770458264345506904646754 48.92542
## Dim.9 3.6791491091917354339102530502714216709 52.60457
## Dim.10 3.5043467680022217791702132672071456909 56.10892
## Dim.11 3.4314980743351579661748473881743848324 59.54042
## Dim.12 2.8774841490293550094747843104414641857 62.41790
## Dim.13 2.7954431441395048985043558786856010556 65.21334
## Dim.14 2.7289407206944522599201263801660388708 67.94228
## Dim.15 2.5922815091372153339932538074208423495 70.53457
## Dim.16 2.4621177355930350394430661253863945603 72.99668
## Dim.17 2.4218228350176076446587103419005870819 75.41851
## Dim.18 2.2087992589694001033251424814807251096 77.62731
## Dim.19 2.1482606411866886908512697118567302823 79.77557
## Dim.20 1.9578258094532581257141146124922670424 81.73339
## Dim.21 1.8458874756715448750554742218810133636 83.57928
## Dim.22 1.7295488057115098179394863109337165952 85.30883
## Dim.23 1.6301143640834021564245404078974388540 86.93894
## Dim.24 1.5739706465658018785802596539724618196 88.51291
## Dim.25 1.5276553802869121678753572268760763109 90.04057
## Dim.26 1.4565080593204995373923793522408232093 91.49708
## Dim.27 1.3903334152682522706356849084841087461 92.88741
## Dim.28 1.2727670139621503064830676521523855627 94.16018
## Dim.29 1.1219254744076163632371390121988952160 95.28210
## Dim.30 1.0543676102782670866986336477566510439 96.33647
## Dim.31 0.9288592975892050462860538573295343667 97.26533
## Dim.32 0.6437203122231088814331201319873798639 97.90905
## Dim.33 0.6230684862895209397137818996270652860 98.53212
## Dim.34 0.5580284473677828982474125041335355490 99.09015
## Dim.35 0.2668253953876491557650751929031684995 99.35697
## Dim.36 0.2273596282533875967413194985056179576 99.58433
## Dim.37 0.1745533321660794057184062921805889346 99.75888
## Dim.38 0.0920322708784641341228649480399326421 99.85092
## Dim.39 0.0839350892523178654380799912360089365 99.93485
## Dim.40 0.0651477179596175320908102435168984812 100.00000
## Dim.41 0.0000000000000000000000000000020421998 100.00000
## Dim.42 0.0000000000000000000000000000003601669 100.00000
## Dim.43 0.0000000000000000000000000000003152227 100.00000
## Dim.44 0.0000000000000000000000000000002751057 100.00000
## Dim.45 0.0000000000000000000000000000002517916 100.00000
fviz_eig(PCA)
#Since 80% of information covered by the fist 20 columns so I'll go for pcaComp 20! > Because I've put dummyies in the pca I get this result
PCa<-preProcess(x = train_diabetic_largedata[-46],method="pca", pcaComp = 20)
PCa_train_diabetic_largedata<-predict(PCa, train_diabetic_largedata)
PCa_test_diabetic_largedata<-predict(PCa, test_diabetic_largedata)
head(PCa_train_diabetic_largedata)
## readmitted PC1 PC2 PC3 PC4 PC5
## 163 YES -2.8746675 0.7110215 1.4531339 1.2768709 0.6116170
## 594 NO 1.0047112 0.9623812 -1.4979231 -1.4719208 -1.7497251
## 697 NO -0.2396595 0.7115938 2.0164012 -0.5031467 0.4535689
## 772 YES -2.8020098 -0.1759601 -0.5212236 0.3838711 -2.6209025
## 1281 YES 1.0467211 1.2756512 -0.4347498 -0.1801328 -1.1873455
## 1756 YES -2.0003096 -0.9507301 -1.1226754 -1.1025223 -1.2133094
## PC6 PC7 PC8 PC9 PC10 PC11
## 163 -1.3751556 0.5647002 2.59165638 0.7118289 -1.69793155 -0.441464220
## 594 -0.9456402 -1.4400853 -0.32813288 -0.1395738 1.81199251 0.383967989
## 697 -1.3666440 0.6366796 -0.29735148 1.0305483 0.59569349 4.524702427
## 772 -2.7203187 -0.4189597 0.07812654 1.6150231 -0.02179667 -0.004352256
## 1281 0.2137869 -2.1545803 -0.22339069 1.3773906 -1.61826251 0.390024649
## 1756 -1.0791650 -2.3325741 -0.32572860 0.3063417 -0.82310510 2.929151963
## PC12 PC13 PC14 PC15 PC16 PC17
## 163 1.5370893 1.7462433 0.3644117 -1.2999932 -0.39089910 -0.7931275
## 594 0.5768248 0.4296597 1.0715284 -1.4379110 0.07618682 1.1314236
## 697 0.6886590 1.0427219 -0.6063008 0.8489401 -0.40155507 -0.7536306
## 772 -1.3871150 -3.9246498 1.1085770 0.8720854 1.73595813 1.5415211
## 1281 -0.6938338 -0.4167629 -1.2436965 0.8513527 -1.21060991 0.6669665
## 1756 -1.1137953 0.0199972 0.9094216 -0.1513736 -0.89725268 1.0215179
## PC18 PC19 PC20
## 163 -0.03895583 0.2828857 -0.2667563
## 594 -1.49641277 -0.3991312 1.5719328
## 697 1.94521252 -0.8059263 -2.8164233
## 772 1.29831147 -0.1996476 -1.4543981
## 1281 -0.83040544 -0.2947236 -0.4508713
## 1756 0.03379931 -1.0123100 -0.2092783
train_diabetic_largedata$readmitted <- as.factor(ifelse(train_diabetic_largedata$readmitted == "YES", "1", "0"))
test_diabetic_largedata$readmitted<-as.factor(ifelse(test_diabetic_largedata$readmitted=="YES","1","0"))
table(train_diabetic_largedata$readmitted)
##
## 0 1
## 93 138
table(test_diabetic_largedata$readmitted)
##
## 0 1
## 23 35
str(train_diabetic_largedata$readmitted)
## Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 2 ...
str(test_diabetic_largedata$readmitted)
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 1 ...
PCa_train_diabetic_largedata$readmitted <- as.factor(ifelse(PCa_train_diabetic_largedata$readmitted == "YES", "1", "0"))
PCa_test_diabetic_largedata$readmitted<-as.factor(ifelse(PCa_test_diabetic_largedata$readmitted=="YES","1","0"))
table(PCa_train_diabetic_largedata$readmitted)
##
## 0 1
## 93 138
table(PCa_test_diabetic_largedata$readmitted)
##
## 0 1
## 23 35
str(PCa_train_diabetic_largedata$readmitted)
## Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 2 ...
str(PCa_test_diabetic_largedata$readmitted)
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 1 ...
##K-means Clustering
# In this dataset we don't have technically any variables that may need unsupervised learning (like comment or unlabeled data) so I just practice the clustering on train data but I know that in clustering we don't need to split the test and train
wcss = vector()
for (i in 1:10){
model_kmeans = kmeans(train_diabetic_largedata, i)
wcss[i] = sum(model_kmeans$withinss)
}
plot(1:10,
wcss,
type = 'b',
main = paste('The Elbow Method'),
xlab = 'Number of clusters',
ylab = 'WCSS')
#based on elbow method two clusters make sence
library(cluster)
model_kmeans<- kmeans(train_diabetic_largedata, 3)
y_kmeans<- model_kmeans$cluster
kmeans_model_kmeans = kmeans(x = train_diabetic_largedata, centers = 3)
clusplot(train_diabetic_largedata,
y_kmeans,
lines = 0,
shade = TRUE,
color = TRUE,
labels = 2,
plotchar = FALSE,
span = TRUE,
main = 'Clusters of patients',
xlab = 'Xlab',
ylab = 'Ylab')
##Hierachical Clustering ON PCA data
hc <- hclust(d = dist(PCa_train_diabetic_largedata, method = 'euclidean'), method = 'ward.D')
plot(hc,
main = 'Dendrogram',
xlab = 'Customers',
ylab = 'Euclidean distances')
#install.packages("factoextra")
library(factoextra)
fviz_nbclust(PCa_train_diabetic_largedata, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
#based on silhouette method 2 number of clusters is recommended
fviz_nbclust(PCa_train_diabetic_largedata[,-1], kmeans, nstart = 1, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")
fviz_nbclust(PCa_train_diabetic_largedata[,-1], hcut, nstart = 1, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")
#based on the graph optimal number of cluster are different from 2 3 and 8 ! but based on dendogram 3 make sence
#I'll go for 2 clusters
y_hc <-cutree(hc, 3)
library(cluster)
clusplot(train_diabetic_largedata,
y_hc,
lines = 0,
shade = TRUE,
color = TRUE,
labels= 2,
plotchar = FALSE,
span = TRUE,
main = 'Clusters of customers',
xlab = 'Xlab',
ylab = 'Ylab')
Suppervised learning
#MODELS :
# List of Models
model_list <-c()
accuracy_list<-c()
kappa_list<-c()
##Logestic regression :
LRmodel<-glm(readmitted~.,train_diabetic_largedata,family = "binomial")
summary(LRmodel)
##
## Call:
## glm(formula = readmitted ~ ., family = "binomial", data = train_diabetic_largedata)
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.685381 8.125013 0.084 0.93277
## time_in_hospital 0.501433 0.233624 2.146 0.03185 *
## num_lab_procedures -0.608617 0.201751 -3.017 0.00256 **
## num_procedures -0.139127 0.191683 -0.726 0.46795
## num_medications 0.419535 0.252480 1.662 0.09658 .
## number_inpatient 0.430593 0.249511 1.726 0.08439 .
## number_diagnoses 0.057040 0.279653 0.204 0.83838
## diag_circ 0.080231 0.212306 0.378 0.70550
## diag_resp 0.194186 0.208264 0.932 0.35113
## diag_dig 0.113123 0.188087 0.601 0.54755
## diag_diab -0.344623 0.201676 -1.709 0.08749 .
## diag_inj 0.044168 0.198233 0.223 0.82368
## diag_geni -0.036827 0.173402 -0.212 0.83181
## diag_other -0.079484 0.206787 -0.384 0.70070
## raceAfricanAmerican -0.527218 0.288128 -1.830 0.06728 .
## raceCaucasian -0.191791 0.338888 -0.566 0.57143
## raceHispanic -0.023366 0.281366 -0.083 0.93382
## genderFemale 0.190843 0.186570 1.023 0.30635
## genderMale NA NA NA NA
## agethird -0.628737 0.283992 -2.214 0.02683 *
## ageforth -0.574444 0.384938 -1.492 0.13562
## agefifth -0.812470 0.456424 -1.780 0.07506 .
## agesixth -0.734031 0.465585 -1.577 0.11489
## ageseventh -0.806224 0.461348 -1.748 0.08054 .
## ageeighth -0.786368 0.426354 -1.844 0.06512 .
## discharge_disposition_id1 0.237655 0.301199 0.789 0.43009
## discharge_disposition_id2 0.410209 0.227801 1.801 0.07174 .
## discharge_disposition_id3 0.388435 0.264536 1.468 0.14200
## discharge_disposition_id6 -0.159235 0.246322 -0.646 0.51799
## admission_type_id1 0.884149 0.559535 1.580 0.11407
## admission_type_id6 0.557252 0.621723 0.896 0.37009
## max_glu_serum200 0.055992 0.193346 0.290 0.77213
## max_glu_serum300 0.074233 0.258567 0.287 0.77404
## max_glu_serumNorm NA NA NA NA
## A1Cresult7 -0.017327 0.232840 -0.074 0.94068
## A1Cresult8 0.006712 0.262899 0.026 0.97963
## A1CresultNorm NA NA NA NA
## admission_source_id1 -4.142956 270.311284 -0.015 0.98777
## admission_source_id7 -4.713139 281.540180 -0.017 0.98664
## insulinDown 0.052218 0.245427 0.213 0.83151
## insulinNo 0.104542 0.417200 0.251 0.80214
## insulinSteady 0.195932 0.368150 0.532 0.59458
## changeCh 0.115412 0.265828 0.434 0.66417
## changeNo NA NA NA NA
## diabetesMedNo 0.021794 0.247601 0.088 0.92986
## diabetesMedYes NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 239.74 on 190 degrees of freedom
## AIC: 321.74
##
## Number of Fisher Scoring iterations: 14
#Based un this model if we keep the variable's change, the y starts at 0.6 and for 1 increase in Time in hospital and number of lab procedures the probabality of outcome increase o.5 times and decrease 0.6 times respectively also 1 increase in age(third) decrease the outcome probabality by 60%
predictLRmodel<-predict(LRmodel,test_diabetic_largedata)
head(predictLRmodel)
## 461 824 962 1984 2309 5016
## -0.8503202 -2.5802732 -1.4662611 0.8109774 0.3940459 -2.9377491
class_predictLRmodel<-ifelse(predictLRmodel>0.5,1,0)
plot(LRmodel)
str(class_predictLRmodel)
## Named num [1:58] 0 0 0 1 0 0 1 1 1 0 ...
## - attr(*, "names")= chr [1:58] "461" "824" "962" "1984" ...
str(test_diabetic_largedata$readmitted)
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 1 ...
class(test_diabetic_largedata$readmitted)
## [1] "factor"
class(class_predictLRmodel)
## [1] "numeric"
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
LRmodelconfusionmatric<-confusionMatrix(test_diabetic_largedata$readmitted,factor(class_predictLRmodel))
LRmodelconfusionmatric
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15 8
## 1 14 21
##
## Accuracy : 0.6207
## 95% CI : (0.4837, 0.7449)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.04347
##
## Kappa : 0.2414
##
## Mcnemar's Test P-Value : 0.28642
##
## Sensitivity : 0.5172
## Specificity : 0.7241
## Pos Pred Value : 0.6522
## Neg Pred Value : 0.6000
## Prevalence : 0.5000
## Detection Rate : 0.2586
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6207
##
## 'Positive' Class : 0
##
LRmodelroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),predictLRmodel,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
LRmodelroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = predictLRmodel, plot = T, print.auc = TRUE)
##
## Data: predictLRmodel in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.7043
model_list <-append(model_list,"LRmodel")
accuracy_list<-append(accuracy_list,LRmodelconfusionmatric$overall['Accuracy'])
kappa_list<-append(kappa_list,LRmodelconfusionmatric$overall['Kappa'])
#Logistic regression with just significant variables :
LRmodel2<-glm(readmitted~time_in_hospital+num_lab_procedures+agethird,train_diabetic_largedata,family = binomial)
summary(LRmodel2)
##
## Call:
## glm(formula = readmitted ~ time_in_hospital + num_lab_procedures +
## agethird, family = binomial, data = train_diabetic_largedata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4262 0.1402 3.039 0.002375 **
## time_in_hospital 0.5210 0.1583 3.291 0.000997 ***
## num_lab_procedures -0.3607 0.1504 -2.399 0.016451 *
## agethird -0.2365 0.1445 -1.637 0.101661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 294.66 on 227 degrees of freedom
## AIC: 302.66
##
## Number of Fisher Scoring iterations: 4
predictLRmodel2<-predict(LRmodel2,test_diabetic_largedata)
head(predictLRmodel2)
## 461 824 962 1984 2309 5016
## 1.0624947 -0.2841240 -0.1532191 0.8203332 0.4546378 -0.3732674
class_predictLRdimodel2<-ifelse(predictLRmodel2>0.5,1,0)
LRmodelconfusionmatric2<-confusionMatrix(test_diabetic_largedata$readmitted,factor(class_predictLRdimodel2))
LRmodelconfusionmatric2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19 4
## 1 16 19
##
## Accuracy : 0.6552
## 95% CI : (0.5188, 0.7751)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.25292
##
## Kappa : 0.3387
##
## Mcnemar's Test P-Value : 0.01391
##
## Sensitivity : 0.5429
## Specificity : 0.8261
## Pos Pred Value : 0.8261
## Neg Pred Value : 0.5429
## Prevalence : 0.6034
## Detection Rate : 0.3276
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6845
##
## 'Positive' Class : 0
##
LRmodel2roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),predictLRmodel2,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
LRmodel2roc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = predictLRmodel2, plot = T, print.auc = TRUE)
##
## Data: predictLRmodel2 in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.718
model_list <-append(model_list,"LRmodel2")
accuracy_list<-append(accuracy_list,LRmodelconfusionmatric2$overall['Accuracy'])
kappa_list<-append(kappa_list,LRmodelconfusionmatric2$overall['Kappa'])
#logestic regretion with the PCA data set
PCALRmodel3<-glm(readmitted~.,PCa_train_diabetic_largedata,family = "binomial")
summary(PCALRmodel3)
##
## Call:
## glm(formula = readmitted ~ ., family = "binomial", data = PCa_train_diabetic_largedata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.52383 0.15565 3.365 0.000764 ***
## PC1 0.12015 0.07021 1.711 0.087032 .
## PC2 -0.15412 0.08192 -1.881 0.059928 .
## PC3 0.31894 0.08963 3.558 0.000373 ***
## PC4 -0.08743 0.09650 -0.906 0.364932
## PC5 -0.03482 0.09824 -0.354 0.723006
## PC6 0.33425 0.11115 3.007 0.002635 **
## PC7 0.07124 0.11015 0.647 0.517781
## PC8 -0.02535 0.11240 -0.226 0.821588
## PC9 0.03506 0.11205 0.313 0.754345
## PC10 -0.29494 0.12758 -2.312 0.020790 *
## PC11 -0.10447 0.12222 -0.855 0.392675
## PC12 0.11079 0.13021 0.851 0.394822
## PC13 -0.30083 0.13580 -2.215 0.026748 *
## PC14 0.21961 0.14505 1.514 0.130018
## PC15 0.29881 0.14221 2.101 0.035623 *
## PC16 -0.04865 0.14162 -0.344 0.731203
## PC17 0.04412 0.13806 0.320 0.749271
## PC18 -0.02303 0.14577 -0.158 0.874464
## PC19 -0.04806 0.14882 -0.323 0.746746
## PC20 0.27985 0.16594 1.686 0.091712 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 265.02 on 210 degrees of freedom
## AIC: 307.02
##
## Number of Fisher Scoring iterations: 4
predictLRmodel3<-predict(PCALRmodel3,PCa_test_diabetic_largedata)
head(predictLRmodel3)
## 461 824 962 1984 2309 5016
## -0.05863418 -0.71263723 -1.19628013 0.51373357 -0.29253418 -1.36470207
class_predictLRmodel3<-ifelse(predictLRmodel3>0.5,1,0)
head(class_predictLRmodel3)
## 461 824 962 1984 2309 5016
## 0 0 0 1 0 0
table(PCa_test_diabetic_largedata$readmitted)
##
## 0 1
## 23 35
PCALRmodelconfusionmatric<-confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(class_predictLRmodel3,ordered=T))
PCALRmodelconfusionmatric
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16 7
## 1 15 20
##
## Accuracy : 0.6207
## 95% CI : (0.4837, 0.7449)
## No Information Rate : 0.5345
## P-Value [Acc > NIR] : 0.1177
##
## Kappa : 0.2521
##
## Mcnemar's Test P-Value : 0.1356
##
## Sensitivity : 0.5161
## Specificity : 0.7407
## Pos Pred Value : 0.6957
## Neg Pred Value : 0.5714
## Prevalence : 0.5345
## Detection Rate : 0.2759
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6284
##
## 'Positive' Class : 0
##
LRmodel3roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),predictLRmodel3,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
LRmodel3roc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = predictLRmodel3, plot = T, print.auc = TRUE)
##
## Data: predictLRmodel3 in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.6522
model_list <-append(model_list,"PCALRmodel3")
accuracy_list<-append(accuracy_list,PCALRmodelconfusionmatric$overall['Accuracy'])
kappa_list<-append(kappa_list,PCALRmodelconfusionmatric$overall['Kappa'])
##Elastic Net
library(caret)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
ELmodel<-cv.glmnet(as.matrix(train_diabetic_largedata[,-46]), train_diabetic_largedata$readmitted, family = "binomial", alpha = 0.5)
ELpredict<-predict(ELmodel,as.matrix(test_diabetic_largedata[,-46]),type="response")
ELpredict_class<-ifelse(ELpredict>0.5,1,0)
mean(ELpredict_class==test_diabetic_largedata$readmitted)
## [1] 0.6034483
ELmodelroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),ELpredict,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
PCaELmodel<-cv.glmnet(as.matrix(PCa_train_diabetic_largedata[,-1]), PCa_train_diabetic_largedata$readmitted, family = "binomial", alpha = 0.5)
plot(PCaELmodel)
PCaELpredict<-predict(PCaELmodel,as.matrix(PCa_test_diabetic_largedata[,-1]))
PCaELpredict_class<-ifelse(PCaELpredict>0.5,1,0)
table(PCa_test_diabetic_largedata$readmitted,PCaELpredict_class)
## PCaELpredict_class
## 0 1
## 0 19 4
## 1 23 12
mean(PCaELpredict_class==PCa_test_diabetic_largedata$readmitted)
## [1] 0.5344828
roc(PCa_test_diabetic_largedata$readmitted,PCaELpredict,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = PCa_test_diabetic_largedata$readmitted, predictor = PCaELpredict, plot = T, print.auc = TRUE)
##
## Data: PCaELpredict in 23 controls (PCa_test_diabetic_largedata$readmitted 0) < 35 cases (PCa_test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.6522
##KNN:
library(class)
vec = c()
k_vec = c()
for (k in 1:50){
predictKNN= knn(train = train_diabetic_largedata[, -46],test = test_diabetic_largedata[, -46],cl = train_diabetic_largedata$readmitted,k = k)
error = mean(predictKNN != test_diabetic_largedata$readmitted)
k_vec = c(k_vec, k)
vec = c(vec, error)}
dataframeerror<-data.frame(k_vec,vec)
min_row <- subset(dataframeerror, vec == min(vec))
ggplot(dataframeerror,aes(x=k_vec,y=vec))+geom_line(color="red")+
geom_hline(yintercept = min(dataframeerror$vec), linetype = "dashed") +annotate("text", x = min_row$k_vec, y = min_row$vec, label = min_row$k_vec, vjust = -1)+geom_point(data = min_row, aes(x = k_vec, y = vec), color = "blue", size = 3)
#k=17
predictKNN<-knn(train = train_diabetic_largedata[, -46],test = test_diabetic_largedata[, -46],cl = train_diabetic_largedata$readmitted,k = 17)
head(predictKNN)
## [1] 1 1 0 1 1 1
## Levels: 0 1
knnconfusionmatrix<-confusionMatrix(factor((test_diabetic_largedata$readmitted),ordered=T),factor(predictKNN,ordered=T))
knnconfusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17 6
## 1 7 28
##
## Accuracy : 0.7759
## 95% CI : (0.6473, 0.8749)
## No Information Rate : 0.5862
## P-Value [Acc > NIR] : 0.001934
##
## Kappa : 0.5351
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.7083
## Specificity : 0.8235
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.8000
## Prevalence : 0.4138
## Detection Rate : 0.2931
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.7659
##
## 'Positive' Class : 0
##
predictKNN.roc<-roc(test_diabetic_largedata$readmitted,factor(predictKNN,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
predictKNN.roc
##
## Call:
## roc.default(response = test_diabetic_largedata$readmitted, predictor = factor(predictKNN, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(predictKNN, ordered = T) in 23 controls (test_diabetic_largedata$readmitted 0) < 35 cases (test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.7696
model_list <-append(model_list,"predictKNN")
accuracy_list<-append(accuracy_list,knnconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,knnconfusionmatrix$overall['Kappa'])
#knn with PCA data
vec2 = c()
k_vec2 = c()
for (k in 1:50){
predictKNN= knn(train = PCa_train_diabetic_largedata[, -1],test = PCa_test_diabetic_largedata[, -1],cl = PCa_train_diabetic_largedata$readmitted,k = k)
error2 = mean(predictKNN != PCa_test_diabetic_largedata$readmitted)
k_vec2 = c(k_vec2, k)
vec2 = c(vec2, error2)}
dataframeerror2<-data.frame(k_vec2,vec2)
min_row2 <- subset(dataframeerror2, vec2 == min(vec2))
ggplot(dataframeerror2,aes(x=k_vec2,y=vec2))+geom_line(color="red")+
geom_hline(yintercept = min(dataframeerror2$vec2), linetype = "dashed") +annotate("text", x = min_row2$k_vec2, y = min_row2$vec2, label = min_row2$k_vec2, vjust = -1)+geom_point(data = min_row2, aes(x = k_vec2, y = vec2), color = "blue", size = 3)
#k=24
PCApredictKNN<-knn(train = PCa_train_diabetic_largedata[, -1],test = PCa_test_diabetic_largedata[, -1],cl = PCa_train_diabetic_largedata$readmitted,k = 33)
head(PCApredictKNN)
## [1] 1 1 1 1 0 0
## Levels: 0 1
PCaKNNconfisionmatrix<-confusionMatrix(factor((PCa_test_diabetic_largedata$readmitted),ordered=T),factor(PCApredictKNN,ordered=T))
PCaKNNconfisionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14 9
## 1 6 29
##
## Accuracy : 0.7414
## 95% CI : (0.6096, 0.8474)
## No Information Rate : 0.6552
## P-Value [Acc > NIR] : 0.1052
##
## Kappa : 0.4473
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.7000
## Specificity : 0.7632
## Pos Pred Value : 0.6087
## Neg Pred Value : 0.8286
## Prevalence : 0.3448
## Detection Rate : 0.2414
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.7316
##
## 'Positive' Class : 0
##
PCApredictKNN.roc<-roc(PCa_test_diabetic_largedata$readmitted,factor(PCApredictKNN,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
PCApredictKNN.roc
##
## Call:
## roc.default(response = PCa_test_diabetic_largedata$readmitted, predictor = factor(PCApredictKNN, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(PCApredictKNN, ordered = T) in 23 controls (PCa_test_diabetic_largedata$readmitted 0) < 35 cases (PCa_test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.7186
model_list <-append(model_list,"PCApredictKNN")
accuracy_list<-append(accuracy_list,PCaKNNconfisionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,PCaKNNconfisionmatrix$overall['Kappa'])
##SVM model :
#svm radial
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:lessR':
##
## kurtosis
tune.svm.largediabet <- tune(svm,train.x=train_diabetic_largedata[, -46],train.y=train_diabetic_largedata[, 46],kernel='radial',ranges=list(cost=10^(-1:2), gamma=c(0.25,.5,1,2)))
tune.svm.largediabet
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 0.1 0.25
##
## - best performance: 0.4030797
SVMmodel1<-svm(formula = readmitted ~ .,data = train_diabetic_largedata,kernel = 'radial',type="C-classification",cost=0.1,gamma=0.25)
predictSVMmodel1<-predict(SVMmodel1,test_diabetic_largedata)
SVMmodel1.roc<-roc(test_diabetic_largedata$readmitted,factor(predictSVMmodel1,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
SVMmodel1.roc
##
## Call:
## roc.default(response = test_diabetic_largedata$readmitted, predictor = factor(predictSVMmodel1, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(predictSVMmodel1, ordered = T) in 23 controls (test_diabetic_largedata$readmitted 0) < 35 cases (test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.5
table(test_diabetic_largedata$readmitted,predictSVMmodel1)
## predictSVMmodel1
## 0 1
## 0 0 23
## 1 0 35
#all the predictions are 1 seems the SVM model overfitted the train ! I'll try cross validation later
#SVM for PCA data
tune.svm.largediabet2 <- tune(svm,train.x=PCa_train_diabetic_largedata[, -1],train.y=PCa_train_diabetic_largedata[, 1],kernel='radial',ranges=list(cost=10^(-1:2), gamma=c(0.25,.5,1,2)))
tune.svm.largediabet2
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.25
##
## - best performance: 0.3856884
PCASVMmodel2<-svm(formula = readmitted ~ .,data = PCa_train_diabetic_largedata,kernel = 'radial',type="C-classification",cost=10,gamma=0.5)
predictSVMmodel2<-predict(PCASVMmodel2,PCa_test_diabetic_largedata)
table(factor(PCa_test_diabetic_largedata$readmitted),factor(predictSVMmodel2,ordered=T))
##
## 1
## 0 23
## 1 35
SVMmodel2.roc<-roc(PCa_test_diabetic_largedata$readmitted,factor(predictSVMmodel2,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
SVMmodel2.roc
##
## Call:
## roc.default(response = PCa_test_diabetic_largedata$readmitted, predictor = factor(predictSVMmodel2, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(predictSVMmodel2, ordered = T) in 23 controls (PCa_test_diabetic_largedata$readmitted 0) < 35 cases (PCa_test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.5
my SVM is working as good as flip a coin !!!!!
##Decession Tree
library(rpart)
DTmodel1<-rpart(readmitted ~ ., method='class',data = train_diabetic_largedata)
plot(DTmodel1, uniform=TRUE, main="Main tree")
text(DTmodel1, use.n=TRUE, all=TRUE)
DTpredict1<-predict(DTmodel1,test_diabetic_largedata[-46])
class_DTpredict1<-ifelse(DTpredict1[,"1"]>0.5,1,0)
DTconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(class_DTpredict1,ordered = T))
DTconfusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7 16
## 1 6 29
##
## Accuracy : 0.6207
## 95% CI : (0.4837, 0.7449)
## No Information Rate : 0.7759
## P-Value [Acc > NIR] : 0.99764
##
## Kappa : 0.1436
##
## Mcnemar's Test P-Value : 0.05501
##
## Sensitivity : 0.5385
## Specificity : 0.6444
## Pos Pred Value : 0.3043
## Neg Pred Value : 0.8286
## Prevalence : 0.2241
## Detection Rate : 0.1207
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5915
##
## 'Positive' Class : 0
##
DTpredict1.roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(class_DTpredict1,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
DTpredict1.roc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(class_DTpredict1, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(class_DTpredict1, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.5665
model_list <-append(model_list,"DTmodel1")
accuracy_list<-append(accuracy_list,DTconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,DTconfusionmatrix$overall['Kappa'])
#Decision Tree in PCA data
DTmodel2<-rpart(readmitted ~ ., method='class',data = PCa_train_diabetic_largedata)
plot(DTmodel2, uniform=TRUE, main="Main tree")
text(DTmodel2, use.n=TRUE, all=TRUE)
DTpredict2<-predict(DTmodel2,PCa_test_diabetic_largedata[-1])
class_DTpredict2<-ifelse(DTpredict2[,"1"]>0.5,1,0)
confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered = T),factor(class_DTpredict2,ordered = T))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9 14
## 1 12 23
##
## Accuracy : 0.5517
## 95% CI : (0.4154, 0.6826)
## No Information Rate : 0.6379
## P-Value [Acc > NIR] : 0.9319
##
## Kappa : 0.0492
##
## Mcnemar's Test P-Value : 0.8445
##
## Sensitivity : 0.4286
## Specificity : 0.6216
## Pos Pred Value : 0.3913
## Neg Pred Value : 0.6571
## Prevalence : 0.3621
## Detection Rate : 0.1552
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5251
##
## 'Positive' Class : 0
##
DTpredict2.roc<-roc(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(class_DTpredict2,ordered = T),plot=T,print.auc=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
DTpredict2.roc
##
## Call:
## roc.default(response = factor(PCa_test_diabetic_largedata$readmitted, ordered = T), predictor = factor(class_DTpredict2, ordered = T), plot = T, print.auc = T)
##
## Data: factor(class_DTpredict2, ordered = T) in 23 controls (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.5242
##Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
RFmodel1<-randomForest(readmitted ~ ., method='class',data =train_diabetic_largedata)
importance(RFmodel1)
## MeanDecreaseGini
## time_in_hospital 8.1818596
## num_lab_procedures 11.6281479
## num_procedures 3.4827573
## num_medications 13.0492024
## number_inpatient 5.9557078
## number_diagnoses 3.7367889
## diag_circ 2.0734962
## diag_resp 2.2239198
## diag_dig 1.0200692
## diag_diab 2.3612038
## diag_inj 1.2613006
## diag_geni 1.3350263
## diag_other 2.4821827
## raceAfricanAmerican 2.1710502
## raceCaucasian 2.2610999
## raceHispanic 1.1082423
## genderFemale 2.3348926
## genderMale 2.6737351
## agethird 1.2403264
## ageforth 1.0953051
## agefifth 1.5149810
## agesixth 1.4820269
## ageseventh 1.6191520
## ageeighth 1.4251049
## discharge_disposition_id1 2.1203363
## discharge_disposition_id2 1.0409082
## discharge_disposition_id3 1.5798936
## discharge_disposition_id6 1.6153082
## admission_type_id1 1.2399576
## admission_type_id6 1.2141682
## max_glu_serum200 1.7696785
## max_glu_serum300 1.3349874
## max_glu_serumNorm 1.6756513
## A1Cresult7 1.4890854
## A1Cresult8 1.7376547
## A1CresultNorm 1.7568771
## admission_source_id1 1.0217697
## admission_source_id7 1.1240186
## insulinDown 0.7181179
## insulinNo 1.4234548
## insulinSteady 1.2803110
## changeCh 1.3610448
## changeNo 1.5414399
## diabetesMedNo 1.7117182
## diabetesMedYes 1.6760362
#based on this model number of medications has the most impact on the readmitted or not readmitted patient, after that time in hospital and number of lab_procedures had impact on readmitted or not readmitted patients
RFpredict1<-predict(RFmodel1,test_diabetic_largedata)
#class_RFpredict1<-ifelse(RFpredict1>0.5,1,0)
RFconfusionmatrix<-confusionMatrix(test_diabetic_largedata$readmitted,factor(RFpredict1,ordered = T))
RFconfusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14 9
## 1 7 28
##
## Accuracy : 0.7241
## 95% CI : (0.591, 0.8334)
## No Information Rate : 0.6379
## P-Value [Acc > NIR] : 0.1080
##
## Kappa : 0.4149
##
## Mcnemar's Test P-Value : 0.8026
##
## Sensitivity : 0.6667
## Specificity : 0.7568
## Pos Pred Value : 0.6087
## Neg Pred Value : 0.8000
## Prevalence : 0.3621
## Detection Rate : 0.2414
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.7117
##
## 'Positive' Class : 0
##
RFpredict1.roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(RFpredict1,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
RFpredict1.roc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(RFpredict1, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(RFpredict1, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.7043
model_list <-append(model_list,"RFmodel1")
accuracy_list<-append(accuracy_list,RFconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,RFconfusionmatrix$overall['Kappa'])
#Rndom Forest with PCA
PCARFmodel2<-randomForest(readmitted ~ ., method='class',data =PCa_train_diabetic_largedata)
importance(PCARFmodel2)
## MeanDecreaseGini
## PC1 5.754521
## PC2 5.820856
## PC3 11.285097
## PC4 4.868472
## PC5 4.822118
## PC6 6.354644
## PC7 5.300672
## PC8 4.262475
## PC9 4.588260
## PC10 6.441359
## PC11 5.251960
## PC12 4.174113
## PC13 6.039580
## PC14 4.834762
## PC15 5.107902
## PC16 4.812358
## PC17 5.614285
## PC18 4.525770
## PC19 5.043179
## PC20 5.605877
PCaRFpredict2<-predict(PCARFmodel2,PCa_test_diabetic_largedata)
#PCaRFconfusionmatrix<-confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(PCaRFpredict2,ordered = T))
#PCaRFconfusionmatrix
RFpredict2.roc<-roc(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(PCaRFpredict2,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
RFpredict2.roc
##
## Call:
## roc.default(response = factor(PCa_test_diabetic_largedata$readmitted, ordered = T), predictor = factor(PCaRFpredict2, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(PCaRFpredict2, ordered = T) in 23 controls (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.7329
#model_list <-append(model_list,"PCARFDTmodel2")
#accuracy_list<-append(accuracy_list,PCaRFconfusionmatrix$overall['Accuracy'])
#kappa_list<-append(kappa_list,PCaRFconfusionmatrix$overall['Kappa'])
head(test_diabetic_largedata)
## time_in_hospital num_lab_procedures num_procedures num_medications
## 461 1.4896652 0.5474001 0.1168388 0.5768653
## 824 0.5055489 2.8592177 1.7044716 0.1766975
## 962 -1.1346449 0.1270696 -0.6769776 -1.5573632
## 1984 -0.4785674 -1.6243074 0.1168388 -0.8904168
## 2309 0.1775101 0.3372348 -0.6769776 0.3100868
## 5016 -0.1505286 2.1586669 0.9106552 -0.4902489
## number_inpatient number_diagnoses diag_circ diag_resp diag_dig
## 461 -0.5275243 -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 824 -0.5275243 -0.6526203 0.9188738 -0.6647022 -0.3705859
## 962 -0.5275243 -1.9867378 0.9188738 -0.6647022 -0.3705859
## 1984 -0.5275243 -1.9867378 -1.0835776 1.4979204 -0.3705859
## 2309 0.2637622 -0.6526203 0.9188738 -0.6647022 -0.3705859
## 5016 -0.5275243 -0.6526203 0.9188738 -0.6647022 -0.3705859
## diag_diab diag_inj diag_geni diag_other raceAfricanAmerican
## 461 0.7684633 -0.281239 -0.4357706 0.9029688 2.4441821
## 824 0.7684633 -0.281239 -0.4357706 0.9029688 -0.4073637
## 962 0.7684633 -0.281239 -0.4357706 0.9029688 -0.4073637
## 1984 0.7684633 -0.281239 -0.4357706 -1.1026638 -0.4073637
## 2309 -1.2956649 -0.281239 -0.4357706 -1.1026638 -0.4073637
## 5016 0.7684633 -0.281239 2.2848512 -1.1026638 2.4441821
## raceCaucasian raceHispanic genderFemale genderMale agethird ageforth
## 461 -1.3975146 -0.3928473 0.8565351 -0.8565351 -0.2436696 -0.4001349
## 824 0.7124584 -0.3928473 -1.1624404 1.1624404 -0.2436696 -0.4001349
## 962 -1.3975146 2.5344987 0.8565351 -0.8565351 -0.2436696 -0.4001349
## 1984 0.7124584 -0.3928473 -1.1624404 1.1624404 -0.2436696 -0.4001349
## 2309 -1.3975146 2.5344987 0.8565351 -0.8565351 -0.2436696 -0.4001349
## 5016 -1.3975146 -0.3928473 0.8565351 -0.8565351 -0.2436696 -0.4001349
## agefifth agesixth ageseventh ageeighth discharge_disposition_id1
## 461 -0.5043104 -0.5043104 1.8985070 -0.4216615 0.7402386
## 824 -0.5043104 -0.5043104 -0.5244495 2.3613042 0.7402386
## 962 1.9743217 -0.5043104 -0.5244495 -0.4216615 0.7402386
## 1984 1.9743217 -0.5043104 -0.5244495 -0.4216615 -1.3450677
## 2309 -0.5043104 -0.5043104 1.8985070 -0.4216615 -1.3450677
## 5016 1.9743217 -0.5043104 -0.5244495 -0.4216615 0.7402386
## discharge_disposition_id2 discharge_disposition_id3
## 461 -0.2629521 -0.3630151
## 824 -0.2629521 -0.3630151
## 962 -0.2629521 -0.3630151
## 1984 -0.2629521 -0.3630151
## 2309 -0.2629521 -0.3630151
## 5016 -0.2629521 -0.3630151
## discharge_disposition_id6 admission_type_id1 admission_type_id6
## 461 -0.3397648 -0.4703831 0.5110378
## 824 -0.3397648 -0.4703831 0.5110378
## 962 -0.3397648 -0.4703831 0.5110378
## 1984 -0.3397648 -0.4703831 0.5110378
## 2309 2.9304714 -0.4703831 0.5110378
## 5016 -0.3397648 -0.4703831 0.5110378
## max_glu_serum200 max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8
## 461 -0.5644597 1.1832846 -0.7193656 -0.5511479 0.8414468
## 824 -0.5644597 1.1832846 -0.7193656 1.8065404 -1.1832846
## 962 -0.5644597 -0.8414468 1.3840959 1.8065404 -1.1832846
## 1984 1.7639365 -0.8414468 -0.7193656 -0.5511479 0.8414468
## 2309 1.7639365 -0.8414468 -0.7193656 -0.5511479 -1.1832846
## 5016 -0.5644597 -0.8414468 1.3840959 -0.5511479 0.8414468
## A1CresultNorm admission_source_id1 admission_source_id7 insulinDown
## 461 -0.4703831 -0.3155425 0.3318105 -0.2534499
## 824 -0.4703831 -0.3155425 0.3318105 -0.2534499
## 962 -0.4703831 -0.3155425 0.3318105 -0.2534499
## 1984 -0.4703831 3.1554255 -3.0007214 -0.2534499
## 2309 2.1167238 -0.3155425 0.3318105 -0.2534499
## 5016 -0.4703831 -0.3155425 0.3318105 -0.2534499
## insulinNo insulinSteady changeCh changeNo diabetesMedNo
## 461 -1.5946130 -0.4497173 1.6118558 -1.6118558 -0.8641489
## 824 0.6243966 -0.4497173 -0.6177172 0.6177172 -0.8641489
## 962 0.6243966 -0.4497173 -0.6177172 0.6177172 -0.8641489
## 1984 0.6243966 -0.4497173 -0.6177172 0.6177172 -0.8641489
## 2309 0.6243966 -0.4497173 -0.6177172 0.6177172 1.1521985
## 5016 -1.5946130 2.2139930 -0.6177172 0.6177172 -0.8641489
## diabetesMedYes readmitted
## 461 0.8641489 1
## 824 0.8641489 1
## 962 0.8641489 1
## 1984 0.8641489 1
## 2309 -1.1521985 1
## 5016 0.8641489 1
test_diabetic_largedata$readmitted<-as.numeric(test_diabetic_largedata$readmitted)
train_diabetic_largedata$readmitted<-as.numeric(train_diabetic_largedata$readmitted)
train_diabetic_largedata$readmitted<-ifelse(train_diabetic_largedata$readmitted==1,0,1)
test_diabetic_largedata$readmitted<-ifelse(test_diabetic_largedata$readmitted==1,0,1)
str(test_diabetic_largedata$readmitted)
## num [1:58] 1 1 1 1 1 1 1 1 0 0 ...
##XGboost
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
XGboostclassifier <- xgboost(data = as.matrix(train_diabetic_largedata[-46]),
label = train_diabetic_largedata$readmitted,
nrounds = 10,
max_depth = 6,
eta = 0.3,
gamma = 0.5,
subsample = 0.8,
colsample_bytree = 0.8,
min_child_weight = 1)
## [1] train-rmse:0.443372
## [2] train-rmse:0.400061
## [3] train-rmse:0.364136
## [4] train-rmse:0.333161
## [5] train-rmse:0.316839
## [6] train-rmse:0.302918
## [7] train-rmse:0.293847
## [8] train-rmse:0.293799
## [9] train-rmse:0.293774
## [10] train-rmse:0.287219
XGboost_predict <- predict(XGboostclassifier, newdata = as.matrix(test_diabetic_largedata[-46]))
head(XGboost_predict)
## [1] 0.4178621 0.2111195 0.1282913 0.7103968 0.7638208 0.1316332
XGboost_predict<-ifelse(XGboost_predict>=0.5,1,0)
table(test_diabetic_largedata$readmitted,XGboost_predict)
## XGboost_predict
## 0 1
## 0 9 14
## 1 13 22
imp_matrix_XGboos<-xgb.importance(model=XGboostclassifier)
xgb.plot.importance(imp_matrix_XGboos)
xgboostconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered=T),factor(XGboost_predict,ordered = T))
xgboostconfusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9 14
## 1 13 22
##
## Accuracy : 0.5345
## 95% CI : (0.3987, 0.6666)
## No Information Rate : 0.6207
## P-Value [Acc > NIR] : 0.9303
##
## Kappa : 0.02
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.4091
## Specificity : 0.6111
## Pos Pred Value : 0.3913
## Neg Pred Value : 0.6286
## Prevalence : 0.3793
## Detection Rate : 0.1552
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5101
##
## 'Positive' Class : 0
##
xgboost.roc<-roc(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(XGboost_predict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
xgboost.roc
##
## Call:
## roc.default(response = factor(PCa_test_diabetic_largedata$readmitted, ordered = T), predictor = factor(XGboost_predict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(XGboost_predict, ordered = T) in 23 controls (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.5099
model_list <-append(model_list,"XGboostclassifier")
accuracy_list<-append(accuracy_list,xgboostconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,xgboostconfusionmatrix$overall['Kappa'])
PCa_test_diabetic_largedata$readmitted<-as.numeric(PCa_test_diabetic_largedata$readmitted)
PCa_train_diabetic_largedata$readmitted<-as.numeric(PCa_train_diabetic_largedata$readmitted)
PCa_train_diabetic_largedata$readmitted<-ifelse(PCa_train_diabetic_largedata$readmitted==1,0,1)
PCa_test_diabetic_largedata$readmitted<-ifelse(PCa_test_diabetic_largedata$readmitted==1,0,1)
#XGboost by PCA data
XGboostclassifier2<- xgboost(data = as.matrix(PCa_train_diabetic_largedata[-1]), label = PCa_train_diabetic_largedata$readmitted, nrounds = 10)
## [1] train-rmse:0.396027
## [2] train-rmse:0.320324
## [3] train-rmse:0.260482
## [4] train-rmse:0.219760
## [5] train-rmse:0.195537
## [6] train-rmse:0.174610
## [7] train-rmse:0.158761
## [8] train-rmse:0.139501
## [9] train-rmse:0.128810
## [10] train-rmse:0.112929
XGboost_predict2<- predict(XGboostclassifier2, newdata = as.matrix(PCa_test_diabetic_largedata[-1]))
head(XGboost_predict2)
## [1] 0.7526193 0.8681863 0.4298209 0.7602748 0.1396990 0.3438241
XGboost_predict2<-ifelse(XGboost_predict2>=0.5,1,0)
table(PCa_test_diabetic_largedata$readmitted,XGboost_predict2)
## XGboost_predict2
## 0 1
## 0 11 12
## 1 10 25
#confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(XGboost_predict2,ordered = T))
imp_matrix_XGboos2<-xgb.importance(model=XGboostclassifier2)
xgb.plot.importance(imp_matrix_XGboos2)
###Cross Validation
table(test_diabetic_largedata$readmitted)
##
## 0 1
## 23 35
table(train_diabetic_largedata$readmitted)
##
## 0 1
## 93 138
train_diabetic_largedata$readmitted<-ifelse(train_diabetic_largedata$readmitted=="0","NO","YES")
test_diabetic_largedata$readmitted<-ifelse(test_diabetic_largedata$readmitted=="0","NO","YES")
# Define trainControl object
TRC <- trainControl(method = "cv",number = 5,classProbs = TRUE,summaryFunction = twoClassSummary)
##Glm Cross validation
# Train glm model with cross-validation
GLMmodelCV <- train(readmitted ~ .,data = train_diabetic_largedata,method = "glm",metric = "ROC",trControl = TRC)
print(GLMmodelCV)
## Generalized Linear Model
##
## 231 samples
## 45 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 185, 185, 184, 185, 185
## Resampling results:
##
## ROC Sens Spec
## 0.6325327 0.5590643 0.673545
GLMmodelCV$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.6325327 0.5590643 0.673545 0.02591945 0.09703022 0.06960111
GLMmodelCVpredict<-predict(GLMmodelCV,test_diabetic_largedata)
GLMmodelCVpredict
## [1] NO NO NO YES YES NO YES YES YES NO YES NO YES NO NO NO NO NO YES
## [20] YES YES NO YES NO YES YES YES NO YES NO YES YES YES NO NO NO YES NO
## [39] YES NO YES YES NO NO YES NO YES NO NO YES YES YES YES NO YES YES YES
## [58] YES
## Levels: NO YES
GLMmodelCVconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(GLMmodelCVpredict,ordered=T))
GLMmodelCVconfusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 14 9
## YES 12 23
##
## Accuracy : 0.6379
## 95% CI : (0.5012, 0.7601)
## No Information Rate : 0.5517
## P-Value [Acc > NIR] : 0.1169
##
## Kappa : 0.26
##
## Mcnemar's Test P-Value : 0.6625
##
## Sensitivity : 0.5385
## Specificity : 0.7188
## Pos Pred Value : 0.6087
## Neg Pred Value : 0.6571
## Prevalence : 0.4483
## Detection Rate : 0.2414
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6286
##
## 'Positive' Class : NO
##
GLMmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(GLMmodelCVpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
GLMmodelCVroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(GLMmodelCVpredict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(GLMmodelCVpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6329
model_list <-append(model_list,"GLMmodelCV")
accuracy_list<-append(accuracy_list,GLMmodelCVconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,GLMmodelCVconfusionmatrix$overall['Kappa'])
##Random Forest Cross validation
RFmodelcv <- train(readmitted~., data = train_diabetic_largedata, method = "rf",metric = "ROC",trControl = TRC)
RFmodelcv
## Random Forest
##
## 231 samples
## 45 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 185, 185, 184, 185, 185
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.6399506 0.2941520 0.8558201
## 23 0.6609580 0.4438596 0.7915344
## 45 0.6559802 0.4228070 0.7550265
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 23.
RFmodelcvpredict<-predict(RFmodelcv,test_diabetic_largedata)
RFmodelcvpredict
## [1] YES NO NO YES YES NO YES YES YES NO YES NO YES YES NO NO NO NO YES
## [20] YES YES YES YES YES YES YES YES NO YES YES YES YES YES YES NO YES YES NO
## [39] YES YES YES YES NO NO YES NO YES NO NO YES YES NO YES YES YES YES YES
## [58] YES
## Levels: NO YES
RFmodelcvconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(RFmodelcvpredict,ordered=T))
RFmodelcvconfusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 11 12
## YES 7 28
##
## Accuracy : 0.6724
## 95% CI : (0.5366, 0.7899)
## No Information Rate : 0.6897
## P-Value [Acc > NIR] : 0.6700
##
## Kappa : 0.289
##
## Mcnemar's Test P-Value : 0.3588
##
## Sensitivity : 0.6111
## Specificity : 0.7000
## Pos Pred Value : 0.4783
## Neg Pred Value : 0.8000
## Prevalence : 0.3103
## Detection Rate : 0.1897
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6556
##
## 'Positive' Class : NO
##
RFMmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(RFmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
RFMmodelCVroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(RFmodelcvpredict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(RFmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6391
model_list <-append(model_list,"RFmodelcv")
accuracy_list<-append(accuracy_list,RFmodelcvconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,RFmodelcvconfusionmatrix$overall['Kappa'])
##knn Cross validation
k_values <- seq(1, 20, by = 1)
KNNmodelCV <- train(
readmitted ~ .,
data = train_diabetic_largedata,
method = "knn",
metric = "ROC",
trControl = TRC,
tuneGrid = expand.grid(k = k_values)
)
KNNmodelCV
## k-Nearest Neighbors
##
## 231 samples
## 45 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 185, 185, 184, 185, 185
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.5235589 0.4836257 0.5634921
## 2 0.4878968 0.4304094 0.5211640
## 3 0.5252158 0.4730994 0.6216931
## 4 0.5505013 0.4502924 0.6216931
## 5 0.5530528 0.4619883 0.6076720
## 6 0.5648914 0.3976608 0.6148148
## 7 0.5776977 0.4520468 0.6148148
## 8 0.5761557 0.4526316 0.6582011
## 9 0.5786097 0.4614035 0.6513228
## 10 0.5725459 0.4087719 0.6150794
## 11 0.5530006 0.3654971 0.6150794
## 12 0.5516674 0.3748538 0.6074074
## 13 0.5700675 0.3760234 0.6296296
## 14 0.5762879 0.3748538 0.6447090
## 15 0.5734510 0.3760234 0.6444444
## 16 0.5828216 0.3766082 0.6801587
## 17 0.5804407 0.4087719 0.6804233
## 18 0.5970934 0.4087719 0.6656085
## 19 0.6058793 0.3877193 0.6878307
## 20 0.6138123 0.3994152 0.6952381
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 20.
KNNmodelcvpredict<-predict(KNNmodelCV,test_diabetic_largedata)
KNNpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(KNNmodelcvpredict,ordered=T))
KNNpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 17 6
## YES 7 28
##
## Accuracy : 0.7759
## 95% CI : (0.6473, 0.8749)
## No Information Rate : 0.5862
## P-Value [Acc > NIR] : 0.001934
##
## Kappa : 0.5351
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.7083
## Specificity : 0.8235
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.8000
## Prevalence : 0.4138
## Detection Rate : 0.2931
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.7659
##
## 'Positive' Class : NO
##
KNNpredictCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(KNNmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
KNNpredictCVroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(KNNmodelcvpredict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(KNNmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.7696
model_list <-append(model_list,"KNNmodelCVF")
accuracy_list<-append(accuracy_list,KNNpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,KNNpredictCVconfiusionmatrix$overall['Kappa'])
##SVM Cross Validation
SVMmodelCV <- train(readmitted ~ ., data = train_diabetic_largedata, method = "svmRadial", tuneLength = 5, preProc = c("center", "scale"), metric = "ROC", trControl = TRC)
SVMmodelCV
## Support Vector Machines with Radial Basis Function Kernel
##
## 231 samples
## 45 predictor
## 2 classes: 'NO', 'YES'
##
## Pre-processing: centered (45), scaled (45)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 185, 185, 185, 185, 184
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.6189223 0.32222222 0.7677249
## 0.50 0.6185324 0.35380117 0.8034392
## 1.00 0.5076998 0.14853801 0.8838624
## 2.00 0.5877471 0.24795322 0.8259259
## 4.00 0.5568713 0.05321637 0.9412698
##
## Tuning parameter 'sigma' was held constant at a value of 0.01274113
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01274113 and C = 0.25.
SVMmodelcvpredict<-predict(SVMmodelCV,test_diabetic_largedata)
SVMpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(SVMmodelcvpredict,ordered=T))
SVMpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 9 14
## YES 5 30
##
## Accuracy : 0.6724
## 95% CI : (0.5366, 0.7899)
## No Information Rate : 0.7586
## P-Value [Acc > NIR] : 0.95038
##
## Kappa : 0.2663
##
## Mcnemar's Test P-Value : 0.06646
##
## Sensitivity : 0.6429
## Specificity : 0.6818
## Pos Pred Value : 0.3913
## Neg Pred Value : 0.8571
## Prevalence : 0.2414
## Detection Rate : 0.1552
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6623
##
## 'Positive' Class : NO
##
SVMmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(SVMmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
SVMmodelCVroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(SVMmodelcvpredict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(SVMmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6242
model_list <-append(model_list,"SVMmodelCV")
accuracy_list<-append(accuracy_list,SVMpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,SVMpredictCVconfiusionmatrix$overall['Kappa'])
#XGboost cross validation
XGmodelCV <- train(readmitted ~ .,data = train_diabetic_largedata,method = "xgbTree",metric = "ROC",trControl = TRC,tuneGrid=expand.grid(nrounds = 10,
max_depth = 6,
eta = 0.3,
gamma = 0.5,
subsample = 0.8,
colsample_bytree = 0.8,
min_child_weight = 1))
XGmodelCV$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1 10 6 0.3 0.5 0.8 1 0.8
XGmodelcvpredict<-predict(XGmodelCV,test_diabetic_largedata)
XGpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(XGmodelcvpredict,ordered=T))
XGpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 12 11
## YES 12 23
##
## Accuracy : 0.6034
## 95% CI : (0.4664, 0.7295)
## No Information Rate : 0.5862
## P-Value [Acc > NIR] : 0.4501
##
## Kappa : 0.1776
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5000
## Specificity : 0.6765
## Pos Pred Value : 0.5217
## Neg Pred Value : 0.6571
## Prevalence : 0.4138
## Detection Rate : 0.2069
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5882
##
## 'Positive' Class : NO
##
XGmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(XGmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
XGmodelCVroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(XGmodelcvpredict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(XGmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.5894
model_list <-append(model_list,"XGmodelCV")
accuracy_list<-append(accuracy_list,XGpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,XGpredictCVconfiusionmatrix$overall['Kappa'])
#Elastic Net Cross Validation
ENmodelcv <- train(readmitted ~ .,data = train_diabetic_largedata,method = "glmnet", metric = "ROC", trControl = TRC)
ENmodelcv
## glmnet
##
## 231 samples
## 45 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 186, 185, 184, 184, 185
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.10 0.0002334386 0.6079644 0.4830409 0.6812169
## 0.10 0.0023343864 0.6188813 0.4836257 0.6812169
## 0.10 0.0233438645 0.6325699 0.4497076 0.7103175
## 0.55 0.0002334386 0.6091773 0.4941520 0.6812169
## 0.55 0.0023343864 0.6141511 0.4941520 0.6666667
## 0.55 0.0233438645 0.6535389 0.4491228 0.7759259
## 1.00 0.0002334386 0.6103762 0.4941520 0.6740741
## 1.00 0.0023343864 0.6137110 0.4614035 0.6738095
## 1.00 0.0233438645 0.6561945 0.4385965 0.7976190
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02334386.
ENmodelcvpredict<-predict(ENmodelcv,test_diabetic_largedata)
ENpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(ENmodelcvpredict,ordered=T))
ENpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 14 9
## YES 9 26
##
## Accuracy : 0.6897
## 95% CI : (0.5546, 0.8046)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.1125
##
## Kappa : 0.3516
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.6087
## Specificity : 0.7429
## Pos Pred Value : 0.6087
## Neg Pred Value : 0.7429
## Prevalence : 0.3966
## Detection Rate : 0.2414
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6758
##
## 'Positive' Class : NO
##
ENmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(ENmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
ENmodelCVroc
##
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted, ordered = T), predictor = factor(ENmodelcvpredict, ordered = T), plot = T, print.auc = TRUE)
##
## Data: factor(ENmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6758
model_list <-append(model_list,"ENmodelcv")
accuracy_list<-append(accuracy_list,ENpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,ENpredictCVconfiusionmatrix$overall['Kappa'])
table(PCa_test_diabetic_largedata$readmitted)
##
## 0 1
## 23 35
table(PCa_train_diabetic_largedata$readmitted)
##
## 0 1
## 93 138
#Cross validation on PCA data set
PCa_train_diabetic_largedata$readmitted<-as.factor(ifelse(PCa_train_diabetic_largedata$readmitted==0,"NO","YES"))
PCa_test_diabetic_largedata$readmitted<-as.factor(ifelse(PCa_test_diabetic_largedata$readmitted==0,"NO","YES"))
table(PCa_train_diabetic_largedata$readmitted)
##
## NO YES
## 93 138
table(PCa_test_diabetic_largedata$readmitted)
##
## NO YES
## 23 35
# Define models to compare
models <- c("glm", "rf", "knn","svmRadial")
# Train and evaluate models
results <- lapply(models, function(model) {
train(readmitted ~ ., data = PCa_train_diabetic_largedata, method = model, trControl = TRC)
})
# Compare models using resamples()
resamples(results)
##
## Call:
## resamples.default(x = results)
##
## Models: Model1, Model2, Model3, Model4
## Number of resamples: 5
## Performance metrics: ROC, Sens, Spec
## Time estimates for: everything, final model fit
results
## [[1]]
## Generalized Linear Model
##
## 231 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 184, 185, 185, 184, 186
## Resampling results:
##
## ROC Sens Spec
## 0.6446053 0.5339181 0.7251323
##
##
## [[2]]
## Random Forest
##
## 231 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 184, 186, 184, 186, 184
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.5871786 0.2350877 0.8108466
## 11 0.5993974 0.3438596 0.7751323
## 20 0.5946270 0.3561404 0.7460317
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.
##
## [[3]]
## k-Nearest Neighbors
##
## 231 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 185, 185, 185, 185, 184
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.5781746 0.4514620 0.6814815
## 7 0.5967697 0.3982456 0.7166667
## 9 0.6610833 0.4736842 0.7825397
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
##
## [[4]]
## Support Vector Machines with Radial Basis Function Kernel
##
## 231 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 185, 184, 185, 186, 184
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.6227041 0.2947368 0.8182540
## 0.50 0.6223143 0.2058480 0.8767196
## 1.00 0.5790093 0.1403509 0.9346561
##
## Tuning parameter 'sigma' was held constant at a value of 0.03162296
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03162296 and C = 0.25.
#Cross validaion in PCA: The ROC of the models on the PCA data is not that high/ the best performance on the PCA data is the KNN by k=9 with the ROC 66% so I didn’t conclud them in my final model comparision
#Conclusion(cmparision of the models)
modelscompare<-data.frame(model_list,accuracy_list,kappa_list)
modelscompare
## model_list accuracy_list kappa_list
## 1 LRmodel 0.6206897 0.24137931
## 2 LRmodel2 0.6551724 0.33865450
## 3 PCALRmodel3 0.6206897 0.25205158
## 4 predictKNN 0.7758621 0.53514180
## 5 PCApredictKNN 0.7413793 0.44726811
## 6 DTmodel1 0.6206897 0.14362416
## 7 RFmodel1 0.7241379 0.41488020
## 8 XGboostclassifier 0.5344828 0.02002503
## 9 GLMmodelCV 0.6379310 0.26002430
## 10 RFmodelcv 0.6724138 0.28903226
## 11 KNNmodelCVF 0.7758621 0.53514180
## 12 SVMmodelCV 0.6724138 0.26631158
## 13 XGmodelCV 0.6034483 0.17755857
## 14 ENmodelcv 0.6896552 0.35155280
ggplot(modelscompare, aes(x = accuracy_list, y = model_list)) +
geom_bar(stat = "identity", aes(fill = kappa_list)) +scale_fill_gradient(low = 'red',high='green')+
xlab("Accuracy") +
ylab("models") +
ggtitle("Models comparision") +
theme(plot.title = element_text(hjust = 0.5))
ACUlist<-c(LRmodelroc$auc,LRmodel2roc$auc,LRmodel3roc$auc,ELmodelroc$auc,predictKNN.roc$auc,PCApredictKNN.roc$auc,SVMmodel1.roc$auc,SVMmodel2.roc$auc,DTpredict1.roc$auc,DTpredict2.roc$auc,RFpredict1.roc$auc,RFpredict2.roc$auc,GLMmodelCVroc$auc,RFMmodelCVroc$auc,KNNpredictCVroc$auc,SVMmodelCVroc$auc,XGmodelCVroc$auc,ENmodelCVroc$auc)
modelnames <- c("LRmodel", "LRmodel2", "LRmodel3", "ELmodel", "predictKNN", "PCApredictKNN", "SVMmodel1", "SVMmodel2", "DTpredict1", "DTpredict2", "RFpredict1", "RFpredict2", "GLMmodelCV", "RFMmodelCV", "KNNpredictCV", "SVMmodelCV", "XGmodelCV", "ENmodelCV")
acucompare<-data.frame(modelnames, ACUlist)
acucompare
## modelnames ACUlist
## 1 LRmodel 0.7043478
## 2 LRmodel2 0.7180124
## 3 LRmodel3 0.6521739
## 4 ELmodel 0.6993789
## 5 predictKNN 0.7695652
## 6 PCApredictKNN 0.7186335
## 7 SVMmodel1 0.5000000
## 8 SVMmodel2 0.5000000
## 9 DTpredict1 0.5664596
## 10 DTpredict2 0.5242236
## 11 RFpredict1 0.7043478
## 12 RFpredict2 0.7329193
## 13 GLMmodelCV 0.6329193
## 14 RFMmodelCV 0.6391304
## 15 KNNpredictCV 0.7695652
## 16 SVMmodelCV 0.6242236
## 17 XGmodelCV 0.5894410
## 18 ENmodelCV 0.6757764
ggplot(acucompare, aes(x = ACUlist, y = modelnames, fill = ACUlist)) +
geom_bar(stat = "identity") +
scale_fill_gradient2(low = "red", mid = "yellow", high = "green", midpoint = 0.65)
#Initially, I performed some data cleaning and preprocessing steps on the dataset. after that I end up containing 45 predictors and 289 observations. I dropped variables that contained missing values, such as weight or the ones that didn't contain any special information like some ids.Moreover, removed rows with missing values.
#Next, I bucketized the diagnostic columns based on a provided table and transformed all categorical variables into dummy variables. I then split the data into test and train sets , and subsequently scaled the entire dataset to prevent overshadowing effects. While I tried using PCA for dimensionality reduction, the presence of dummy variables caused that 20 principal components could cover 80% of the data, leading to only limited dimensionality reduction. Nonetheless, I kept the PCA data for evaluating the model performance.
#To evaluate the performance of the supervised learning models, I applied them to both the PCA and non-PCA datasets, and employed 5-fold cross-validation. As the outcome variable was not imbalanced, I considered accuracy evaluation metrics for comparing the models.As well as AUC and Kappa. For checking which same models fit better to data like evaluating several logestic rigressions I compare their Akaike as well.
#After comparing the performance of various models, I found that the KNN model performed best on the non-PCA dataset with an AUC of 0.7696 and a precision of 73%. Although my best model(KNN) is based on Euclidean distance and it is hard to say which predictor was more important or play more important role in predicting the readmition of the patient, Based on other models like random forest2(accuracy 72% and AUC 69%) and logestic2 regression (accuracy 65%, AUC 71%) I found "Number of lab procedures " and "time in hospitals" important variables